1 // Copyright 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // 128-bit vectors and SSE4 instructions, plus some AVX2 and AVX512-VL
16 // operations when compiling for those targets.
17 // External include guard in highway.h - see comment there.
18 
19 #include <emmintrin.h>
20 #if HWY_TARGET == HWY_SSSE3
21 #include <tmmintrin.h>  // SSSE3
22 #else
23 #include <smmintrin.h>  // SSE4
24 #include <wmmintrin.h>  // CLMUL
25 #endif
26 #include <stddef.h>
27 #include <stdint.h>
28 
29 #include "hwy/base.h"
30 #include "hwy/ops/shared-inl.h"
31 
32 // Clang 3.9 generates VINSERTF128 instead of the desired VBROADCASTF128,
33 // which would free up port5. However, inline assembly isn't supported on
34 // MSVC, results in incorrect output on GCC 8.3, and raises "invalid output size
35 // for constraint" errors on Clang (https://gcc.godbolt.org/z/-Jt_-F), hence we
36 // disable it.
37 #ifndef HWY_LOADDUP_ASM
38 #define HWY_LOADDUP_ASM 0
39 #endif
40 
41 HWY_BEFORE_NAMESPACE();
42 namespace hwy {
43 namespace HWY_NAMESPACE {
44 
45 template <typename T>
46 using Full128 = Simd<T, 16 / sizeof(T)>;
47 
48 namespace detail {
49 
50 template <typename T>
51 struct Raw128 {
52   using type = __m128i;
53 };
54 template <>
55 struct Raw128<float> {
56   using type = __m128;
57 };
58 template <>
59 struct Raw128<double> {
60   using type = __m128d;
61 };
62 
63 }  // namespace detail
64 
65 template <typename T, size_t N = 16 / sizeof(T)>
66 class Vec128 {
67   using Raw = typename detail::Raw128<T>::type;
68 
69  public:
70   // Compound assignment. Only usable if there is a corresponding non-member
71   // binary operator overload. For example, only f32 and f64 support division.
72   HWY_INLINE Vec128& operator*=(const Vec128 other) {
73     return *this = (*this * other);
74   }
75   HWY_INLINE Vec128& operator/=(const Vec128 other) {
76     return *this = (*this / other);
77   }
78   HWY_INLINE Vec128& operator+=(const Vec128 other) {
79     return *this = (*this + other);
80   }
81   HWY_INLINE Vec128& operator-=(const Vec128 other) {
82     return *this = (*this - other);
83   }
84   HWY_INLINE Vec128& operator&=(const Vec128 other) {
85     return *this = (*this & other);
86   }
87   HWY_INLINE Vec128& operator|=(const Vec128 other) {
88     return *this = (*this | other);
89   }
90   HWY_INLINE Vec128& operator^=(const Vec128 other) {
91     return *this = (*this ^ other);
92   }
93 
94   Raw raw;
95 };
96 
97 // Forward-declare for use by DeduceD, see below.
98 template <typename T>
99 class Vec256;
100 template <typename T>
101 class Vec512;
102 
103 #if HWY_TARGET <= HWY_AVX3
104 
105 namespace detail {
106 
107 // Template arg: sizeof(lane type)
108 template <size_t size>
109 struct RawMask128 {};
110 template <>
111 struct RawMask128<1> {
112   using type = __mmask16;
113 };
114 template <>
115 struct RawMask128<2> {
116   using type = __mmask8;
117 };
118 template <>
119 struct RawMask128<4> {
120   using type = __mmask8;
121 };
122 template <>
123 struct RawMask128<8> {
124   using type = __mmask8;
125 };
126 
127 }  // namespace detail
128 
129 template <typename T, size_t N>
130 struct Mask128 {
131   using Raw = typename detail::RawMask128<sizeof(T)>::type;
132 
133   static Mask128<T, N> FromBits(uint64_t mask_bits) {
134     return Mask128<T, N>{static_cast<Raw>(mask_bits)};
135   }
136 
137   Raw raw;
138 };
139 
140 #else  // AVX2 or below
141 
142 // FF..FF or 0.
143 template <typename T, size_t N = 16 / sizeof(T)>
144 struct Mask128 {
145   typename detail::Raw128<T>::type raw;
146 };
147 
148 #endif  // HWY_TARGET <= HWY_AVX3
149 
150 namespace detail {
151 
152 // Deduce Simd<T, N> from Vec*<T, N> (pointers because Vec256/512 may be
153 // incomplete types at this point; this is simpler than avoiding multiple
154 // definitions of DFromV via #if)
155 struct DeduceD {
156   template <typename T, size_t N>
157   Simd<T, N> operator()(const Vec128<T, N>*) const {
158     return Simd<T, N>();
159   }
160   template <typename T>
161   Simd<T, 32 / sizeof(T)> operator()(const Vec256<T>*) const {
162     return Simd<T, 32 / sizeof(T)>();
163   }
164   template <typename T>
165   Simd<T, 64 / sizeof(T)> operator()(const Vec512<T>*) const {
166     return Simd<T, 64 / sizeof(T)>();
167   }
168 };
169 
170 // Workaround for MSVC v19.14: alias with a dependent type fails to specialize.
171 template <class V>
172 struct ExpandDFromV {
173   using type = decltype(DeduceD()(static_cast<V*>(nullptr)));
174 };
175 
176 }  // namespace detail
177 
178 template <class V>
179 using DFromV = typename detail::ExpandDFromV<V>::type;
180 
181 template <class V>
182 using TFromV = TFromD<DFromV<V>>;
183 
184 // ------------------------------ BitCast
185 
186 namespace detail {
187 
188 HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; }
189 HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); }
190 HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); }
191 
192 template <typename T, size_t N>
193 HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
194   return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
195 }
196 
197 // Cannot rely on function overloading because return types differ.
198 template <typename T>
199 struct BitCastFromInteger128 {
200   HWY_INLINE __m128i operator()(__m128i v) { return v; }
201 };
202 template <>
203 struct BitCastFromInteger128<float> {
204   HWY_INLINE __m128 operator()(__m128i v) { return _mm_castsi128_ps(v); }
205 };
206 template <>
207 struct BitCastFromInteger128<double> {
208   HWY_INLINE __m128d operator()(__m128i v) { return _mm_castsi128_pd(v); }
209 };
210 
211 template <typename T, size_t N>
212 HWY_INLINE Vec128<T, N> BitCastFromByte(Simd<T, N> /* tag */,
213                                         Vec128<uint8_t, N * sizeof(T)> v) {
214   return Vec128<T, N>{BitCastFromInteger128<T>()(v.raw)};
215 }
216 
217 }  // namespace detail
218 
219 template <typename T, size_t N, typename FromT>
220 HWY_API Vec128<T, N> BitCast(Simd<T, N> d,
221                              Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
222   return detail::BitCastFromByte(d, detail::BitCastToByte(v));
223 }
224 
225 // ------------------------------ Zero
226 
227 // Returns an all-zero vector/part.
228 template <typename T, size_t N, HWY_IF_LE128(T, N)>
229 HWY_API Vec128<T, N> Zero(Simd<T, N> /* tag */) {
230   return Vec128<T, N>{_mm_setzero_si128()};
231 }
232 template <size_t N, HWY_IF_LE128(float, N)>
233 HWY_API Vec128<float, N> Zero(Simd<float, N> /* tag */) {
234   return Vec128<float, N>{_mm_setzero_ps()};
235 }
236 template <size_t N, HWY_IF_LE128(double, N)>
237 HWY_API Vec128<double, N> Zero(Simd<double, N> /* tag */) {
238   return Vec128<double, N>{_mm_setzero_pd()};
239 }
240 
241 template <class D>
242 using VFromD = decltype(Zero(D()));
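
// For example, VFromD<Simd<int32_t, 4>> is Vec128<int32_t, 4>, and
// Zero(Simd<int32_t, 4>()) returns such a vector with all lanes equal to 0.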
243 
244 // ------------------------------ Set
245 
246 // Returns a vector/part with all lanes set to "t".
247 template <size_t N, HWY_IF_LE128(uint8_t, N)>
248 HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N> /* tag */, const uint8_t t) {
249   return Vec128<uint8_t, N>{_mm_set1_epi8(static_cast<char>(t))};  // NOLINT
250 }
251 template <size_t N, HWY_IF_LE128(uint16_t, N)>
252 HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N> /* tag */, const uint16_t t) {
253   return Vec128<uint16_t, N>{_mm_set1_epi16(static_cast<short>(t))};  // NOLINT
254 }
255 template <size_t N, HWY_IF_LE128(uint32_t, N)>
256 HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N> /* tag */, const uint32_t t) {
257   return Vec128<uint32_t, N>{_mm_set1_epi32(static_cast<int>(t))};
258 }
259 template <size_t N, HWY_IF_LE128(uint64_t, N)>
260 HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N> /* tag */, const uint64_t t) {
261   return Vec128<uint64_t, N>{
262       _mm_set1_epi64x(static_cast<long long>(t))};  // NOLINT
263 }
264 template <size_t N, HWY_IF_LE128(int8_t, N)>
265 HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N> /* tag */, const int8_t t) {
266   return Vec128<int8_t, N>{_mm_set1_epi8(static_cast<char>(t))};  // NOLINT
267 }
268 template <size_t N, HWY_IF_LE128(int16_t, N)>
269 HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N> /* tag */, const int16_t t) {
270   return Vec128<int16_t, N>{_mm_set1_epi16(static_cast<short>(t))};  // NOLINT
271 }
272 template <size_t N, HWY_IF_LE128(int32_t, N)>
273 HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N> /* tag */, const int32_t t) {
274   return Vec128<int32_t, N>{_mm_set1_epi32(t)};
275 }
276 template <size_t N, HWY_IF_LE128(int64_t, N)>
277 HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N> /* tag */, const int64_t t) {
278   return Vec128<int64_t, N>{
279       _mm_set1_epi64x(static_cast<long long>(t))};  // NOLINT
280 }
281 template <size_t N, HWY_IF_LE128(float, N)>
282 HWY_API Vec128<float, N> Set(Simd<float, N> /* tag */, const float t) {
283   return Vec128<float, N>{_mm_set1_ps(t)};
284 }
285 template <size_t N, HWY_IF_LE128(double, N)>
286 HWY_API Vec128<double, N> Set(Simd<double, N> /* tag */, const double t) {
287   return Vec128<double, N>{_mm_set1_pd(t)};
288 }
289 
290 HWY_DIAGNOSTICS(push)
291 HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
292 
293 // Returns a vector with uninitialized elements.
294 template <typename T, size_t N, HWY_IF_LE128(T, N)>
295 HWY_API Vec128<T, N> Undefined(Simd<T, N> /* tag */) {
296   // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
297   // generate an XOR instruction.
298   return Vec128<T, N>{_mm_undefined_si128()};
299 }
300 template <size_t N, HWY_IF_LE128(float, N)>
301 HWY_API Vec128<float, N> Undefined(Simd<float, N> /* tag */) {
302   return Vec128<float, N>{_mm_undefined_ps()};
303 }
304 template <size_t N, HWY_IF_LE128(double, N)>
305 HWY_API Vec128<double, N> Undefined(Simd<double, N> /* tag */) {
306   return Vec128<double, N>{_mm_undefined_pd()};
307 }
308 
309 HWY_DIAGNOSTICS(pop)
310 
311 // ------------------------------ GetLane
312 
313 // Gets the single value stored in a vector/part.
314 template <size_t N>
315 HWY_API uint8_t GetLane(const Vec128<uint8_t, N> v) {
316   return static_cast<uint8_t>(_mm_cvtsi128_si32(v.raw) & 0xFF);
317 }
318 template <size_t N>
319 HWY_API int8_t GetLane(const Vec128<int8_t, N> v) {
320   return static_cast<int8_t>(_mm_cvtsi128_si32(v.raw) & 0xFF);
321 }
322 template <size_t N>
323 HWY_API uint16_t GetLane(const Vec128<uint16_t, N> v) {
324   return static_cast<uint16_t>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
325 }
326 template <size_t N>
327 HWY_API int16_t GetLane(const Vec128<int16_t, N> v) {
328   return static_cast<int16_t>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
329 }
330 template <size_t N>
331 HWY_API uint32_t GetLane(const Vec128<uint32_t, N> v) {
332   return static_cast<uint32_t>(_mm_cvtsi128_si32(v.raw));
333 }
334 template <size_t N>
335 HWY_API int32_t GetLane(const Vec128<int32_t, N> v) {
336   return _mm_cvtsi128_si32(v.raw);
337 }
338 template <size_t N>
339 HWY_API float GetLane(const Vec128<float, N> v) {
340   return _mm_cvtss_f32(v.raw);
341 }
342 template <size_t N>
343 HWY_API uint64_t GetLane(const Vec128<uint64_t, N> v) {
344 #if HWY_ARCH_X86_32
345   alignas(16) uint64_t lanes[2];
346   Store(v, Simd<uint64_t, N>(), lanes);
347   return lanes[0];
348 #else
349   return static_cast<uint64_t>(_mm_cvtsi128_si64(v.raw));
350 #endif
351 }
352 template <size_t N>
353 HWY_API int64_t GetLane(const Vec128<int64_t, N> v) {
354 #if HWY_ARCH_X86_32
355   alignas(16) int64_t lanes[2];
356   Store(v, Simd<int64_t, N>(), lanes);
357   return lanes[0];
358 #else
359   return _mm_cvtsi128_si64(v.raw);
360 #endif
361 }
362 template <size_t N>
363 HWY_API double GetLane(const Vec128<double, N> v) {
364   return _mm_cvtsd_f64(v.raw);
365 }
366 
367 // ================================================== LOGICAL
368 
369 // ------------------------------ And
370 
371 template <typename T, size_t N>
372 HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
373   return Vec128<T, N>{_mm_and_si128(a.raw, b.raw)};
374 }
375 template <size_t N>
376 HWY_API Vec128<float, N> And(const Vec128<float, N> a,
377                              const Vec128<float, N> b) {
378   return Vec128<float, N>{_mm_and_ps(a.raw, b.raw)};
379 }
380 template <size_t N>
381 HWY_API Vec128<double, N> And(const Vec128<double, N> a,
382                               const Vec128<double, N> b) {
383   return Vec128<double, N>{_mm_and_pd(a.raw, b.raw)};
384 }
385 
386 // ------------------------------ AndNot
387 
388 // Returns ~not_mask & mask.
389 template <typename T, size_t N>
390 HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
391   return Vec128<T, N>{_mm_andnot_si128(not_mask.raw, mask.raw)};
392 }
393 template <size_t N>
394 HWY_API Vec128<float, N> AndNot(const Vec128<float, N> not_mask,
395                                 const Vec128<float, N> mask) {
396   return Vec128<float, N>{_mm_andnot_ps(not_mask.raw, mask.raw)};
397 }
398 template <size_t N>
399 HWY_API Vec128<double, N> AndNot(const Vec128<double, N> not_mask,
400                                  const Vec128<double, N> mask) {
401   return Vec128<double, N>{_mm_andnot_pd(not_mask.raw, mask.raw)};
402 }
403 
404 // ------------------------------ Or
405 
406 template <typename T, size_t N>
407 HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
408   return Vec128<T, N>{_mm_or_si128(a.raw, b.raw)};
409 }
410 
411 template <size_t N>
412 HWY_API Vec128<float, N> Or(const Vec128<float, N> a,
413                             const Vec128<float, N> b) {
414   return Vec128<float, N>{_mm_or_ps(a.raw, b.raw)};
415 }
416 template <size_t N>
417 HWY_API Vec128<double, N> Or(const Vec128<double, N> a,
418                              const Vec128<double, N> b) {
419   return Vec128<double, N>{_mm_or_pd(a.raw, b.raw)};
420 }
421 
422 // ------------------------------ Xor
423 
424 template <typename T, size_t N>
425 HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
426   return Vec128<T, N>{_mm_xor_si128(a.raw, b.raw)};
427 }
428 
429 template <size_t N>
430 HWY_API Vec128<float, N> Xor(const Vec128<float, N> a,
431                              const Vec128<float, N> b) {
432   return Vec128<float, N>{_mm_xor_ps(a.raw, b.raw)};
433 }
434 template <size_t N>
435 HWY_API Vec128<double, N> Xor(const Vec128<double, N> a,
436                               const Vec128<double, N> b) {
437   return Vec128<double, N>{_mm_xor_pd(a.raw, b.raw)};
438 }
439 
440 // ------------------------------ Not
441 
442 template <typename T, size_t N>
443 HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
444   using TU = MakeUnsigned<T>;
445 #if HWY_TARGET <= HWY_AVX3
446   const __m128i vu = BitCast(Simd<TU, N>(), v).raw;
447   return BitCast(Simd<T, N>(),
448                  Vec128<TU, N>{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
449 #else
450   return Xor(v, BitCast(Simd<T, N>(), Vec128<TU, N>{_mm_set1_epi32(-1)}));
451 #endif
452 }
453 
454 // ------------------------------ Operator overloads (internal-only if float)
455 
456 template <typename T, size_t N>
457 HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
458   return And(a, b);
459 }
460 
461 template <typename T, size_t N>
462 HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
463   return Or(a, b);
464 }
465 
466 template <typename T, size_t N>
467 HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
468   return Xor(a, b);
469 }
470 
471 // ------------------------------ PopulationCount
472 
473 // 8/16 require BITALG, 32/64 require VPOPCNTDQ.
474 #if HWY_TARGET == HWY_AVX3_DL
475 
476 #ifdef HWY_NATIVE_POPCNT
477 #undef HWY_NATIVE_POPCNT
478 #else
479 #define HWY_NATIVE_POPCNT
480 #endif
481 
482 namespace detail {
483 
484 template <typename T, size_t N>
485 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
486                                         Vec128<T, N> v) {
487   return Vec128<T, N>{_mm_popcnt_epi8(v.raw)};
488 }
489 template <typename T, size_t N>
490 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
491                                         Vec128<T, N> v) {
492   return Vec128<T, N>{_mm_popcnt_epi16(v.raw)};
493 }
494 template <typename T, size_t N>
495 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
496                                         Vec128<T, N> v) {
497   return Vec128<T, N>{_mm_popcnt_epi32(v.raw)};
498 }
499 template <typename T, size_t N>
500 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
501                                         Vec128<T, N> v) {
502   return Vec128<T, N>{_mm_popcnt_epi64(v.raw)};
503 }
504 
505 }  // namespace detail
506 
507 template <typename T, size_t N>
508 HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
509   return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
510 }
511 
512 #endif  // HWY_TARGET == HWY_AVX3_DL
513 
514 // ================================================== SIGN
515 
516 // ------------------------------ Neg
517 
518 template <typename T, size_t N, HWY_IF_FLOAT(T)>
519 HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
520   return Xor(v, SignBit(Simd<T, N>()));
521 }
522 
523 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
524 HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
525   return Zero(Simd<T, N>()) - v;
526 }
527 
528 // ------------------------------ Abs
529 
530 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
531 template <size_t N>
532 HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
533 #if HWY_COMPILER_MSVC
534   // Workaround for incorrect codegen? (reaches breakpoint)
535   const auto zero = Zero(Simd<int8_t, N>());
536   return Vec128<int8_t, N>{_mm_max_epi8(v.raw, (zero - v).raw)};
537 #else
538   return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)};
539 #endif
540 }
541 template <size_t N>
542 HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
543   return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)};
544 }
545 template <size_t N>
546 HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
547   return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
548 }
549 // i64 is implemented after BroadcastSignBit.
550 template <size_t N>
551 HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
552   const Vec128<int32_t, N> mask{_mm_set1_epi32(0x7FFFFFFF)};
553   return v & BitCast(Simd<float, N>(), mask);
554 }
555 template <size_t N>
556 HWY_API Vec128<double, N> Abs(const Vec128<double, N> v) {
557   const Vec128<int64_t, N> mask{_mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL)};
558   return v & BitCast(Simd<double, N>(), mask);
559 }
560 
561 // ------------------------------ CopySign
562 
563 template <typename T, size_t N>
564 HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
565                               const Vec128<T, N> sign) {
566   static_assert(IsFloat<T>(), "Only makes sense for floating-point");
567 
568   const Simd<T, N> d;
569   const auto msb = SignBit(d);
570 
571 #if HWY_TARGET <= HWY_AVX3
572   const Rebind<MakeUnsigned<T>, decltype(d)> du;
  // Truth table for msb, magn, sign | bitwise msb ? sign : magn
574   //                  0    0     0   |  0
575   //                  0    0     1   |  0
576   //                  0    1     0   |  1
577   //                  0    1     1   |  1
578   //                  1    0     0   |  0
579   //                  1    0     1   |  1
580   //                  1    1     0   |  0
581   //                  1    1     1   |  1
582   // The lane size does not matter because we are not using predication.
583   const __m128i out = _mm_ternarylogic_epi32(
584       BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
585   return BitCast(d, decltype(Zero(du)){out});
586 #else
587   return Or(AndNot(msb, magn), And(msb, sign));
588 #endif
589 }
590 
591 template <typename T, size_t N>
592 HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
593                                    const Vec128<T, N> sign) {
594 #if HWY_TARGET <= HWY_AVX3
595   // AVX3 can also handle abs < 0, so no extra action needed.
596   return CopySign(abs, sign);
597 #else
598   return Or(abs, And(SignBit(Simd<T, N>()), sign));
599 #endif
600 }
601 
602 // ================================================== MASK
603 
604 #if HWY_TARGET <= HWY_AVX3
605 
606 // ------------------------------ IfThenElse
607 
// Returns mask ? yes : no.
609 
610 namespace detail {
611 
612 // Templates for signed/unsigned integer of a particular size.
613 template <typename T, size_t N>
614 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<1> /* tag */,
615                                    Mask128<T, N> mask, Vec128<T, N> yes,
616                                    Vec128<T, N> no) {
617   return Vec128<T, N>{_mm_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
618 }
619 template <typename T, size_t N>
620 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<2> /* tag */,
621                                    Mask128<T, N> mask, Vec128<T, N> yes,
622                                    Vec128<T, N> no) {
623   return Vec128<T, N>{_mm_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
624 }
625 template <typename T, size_t N>
626 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<4> /* tag */,
627                                    Mask128<T, N> mask, Vec128<T, N> yes,
628                                    Vec128<T, N> no) {
629   return Vec128<T, N>{_mm_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
630 }
631 template <typename T, size_t N>
632 HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<8> /* tag */,
633                                    Mask128<T, N> mask, Vec128<T, N> yes,
634                                    Vec128<T, N> no) {
635   return Vec128<T, N>{_mm_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
636 }
637 
638 }  // namespace detail
639 
640 template <typename T, size_t N>
641 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
642                                 Vec128<T, N> no) {
643   return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
644 }
645 
646 template <size_t N>
647 HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask,
648                                     Vec128<float, N> yes, Vec128<float, N> no) {
649   return Vec128<float, N>{_mm_mask_mov_ps(no.raw, mask.raw, yes.raw)};
650 }
651 
652 template <size_t N>
653 HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask,
654                                      Vec128<double, N> yes,
655                                      Vec128<double, N> no) {
656   return Vec128<double, N>{_mm_mask_mov_pd(no.raw, mask.raw, yes.raw)};
657 }
658 
659 namespace detail {
660 
661 template <typename T, size_t N>
662 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<1> /* tag */,
663                                        Mask128<T, N> mask, Vec128<T, N> yes) {
664   return Vec128<T, N>{_mm_maskz_mov_epi8(mask.raw, yes.raw)};
665 }
666 template <typename T, size_t N>
667 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<2> /* tag */,
668                                        Mask128<T, N> mask, Vec128<T, N> yes) {
669   return Vec128<T, N>{_mm_maskz_mov_epi16(mask.raw, yes.raw)};
670 }
671 template <typename T, size_t N>
672 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<4> /* tag */,
673                                        Mask128<T, N> mask, Vec128<T, N> yes) {
674   return Vec128<T, N>{_mm_maskz_mov_epi32(mask.raw, yes.raw)};
675 }
676 template <typename T, size_t N>
677 HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<8> /* tag */,
678                                        Mask128<T, N> mask, Vec128<T, N> yes) {
679   return Vec128<T, N>{_mm_maskz_mov_epi64(mask.raw, yes.raw)};
680 }
681 
682 }  // namespace detail
683 
684 template <typename T, size_t N>
685 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
686   return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
687 }
688 
689 template <size_t N>
690 HWY_API Vec128<float, N> IfThenElseZero(Mask128<float, N> mask,
691                                         Vec128<float, N> yes) {
692   return Vec128<float, N>{_mm_maskz_mov_ps(mask.raw, yes.raw)};
693 }
694 
695 template <size_t N>
696 HWY_API Vec128<double, N> IfThenElseZero(Mask128<double, N> mask,
697                                          Vec128<double, N> yes) {
698   return Vec128<double, N>{_mm_maskz_mov_pd(mask.raw, yes.raw)};
699 }
700 
701 namespace detail {
702 
703 template <typename T, size_t N>
704 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<1> /* tag */,
705                                        Mask128<T, N> mask, Vec128<T, N> no) {
706   // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
707   return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
708 }
709 template <typename T, size_t N>
710 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<2> /* tag */,
711                                        Mask128<T, N> mask, Vec128<T, N> no) {
712   return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
713 }
714 template <typename T, size_t N>
715 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<4> /* tag */,
716                                        Mask128<T, N> mask, Vec128<T, N> no) {
717   return Vec128<T, N>{_mm_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
718 }
719 template <typename T, size_t N>
720 HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<8> /* tag */,
721                                        Mask128<T, N> mask, Vec128<T, N> no) {
722   return Vec128<T, N>{_mm_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
723 }
724 
725 }  // namespace detail
726 
727 template <typename T, size_t N>
728 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
729   return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
730 }
731 
732 template <size_t N>
733 HWY_API Vec128<float, N> IfThenZeroElse(Mask128<float, N> mask,
734                                         Vec128<float, N> no) {
735   return Vec128<float, N>{_mm_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
736 }
737 
738 template <size_t N>
739 HWY_API Vec128<double, N> IfThenZeroElse(Mask128<double, N> mask,
740                                          Vec128<double, N> no) {
741   return Vec128<double, N>{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
742 }
743 
744 // ------------------------------ Mask logical
745 
746 // For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
747 #if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS) &&         \
748     (HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC >= 700 || \
749      HWY_COMPILER_CLANG >= 800)
750 #define HWY_COMPILER_HAS_MASK_INTRINSICS 1
751 #else
752 #define HWY_COMPILER_HAS_MASK_INTRINSICS 0
753 #endif
754 
755 namespace detail {
756 
757 template <typename T, size_t N>
758 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
759                              const Mask128<T, N> b) {
760 #if HWY_COMPILER_HAS_MASK_INTRINSICS
761   return Mask128<T, N>{_kand_mask16(a.raw, b.raw)};
762 #else
763   return Mask128<T, N>{static_cast<__mmask16>(a.raw & b.raw)};
764 #endif
765 }
766 template <typename T, size_t N>
767 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
768                              const Mask128<T, N> b) {
769 #if HWY_COMPILER_HAS_MASK_INTRINSICS
770   return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
771 #else
772   return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
773 #endif
774 }
775 template <typename T, size_t N>
776 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
777                              const Mask128<T, N> b) {
778 #if HWY_COMPILER_HAS_MASK_INTRINSICS
779   return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
780 #else
781   return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
782 #endif
783 }
784 template <typename T, size_t N>
785 HWY_INLINE Mask128<T, N> And(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
786                              const Mask128<T, N> b) {
787 #if HWY_COMPILER_HAS_MASK_INTRINSICS
788   return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
789 #else
790   return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
791 #endif
792 }
793 
794 template <typename T, size_t N>
795 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
796                                 const Mask128<T, N> b) {
797 #if HWY_COMPILER_HAS_MASK_INTRINSICS
798   return Mask128<T, N>{_kandn_mask16(a.raw, b.raw)};
799 #else
800   return Mask128<T, N>{static_cast<__mmask16>(~a.raw & b.raw)};
801 #endif
802 }
803 template <typename T, size_t N>
804 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
805                                 const Mask128<T, N> b) {
806 #if HWY_COMPILER_HAS_MASK_INTRINSICS
807   return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
808 #else
809   return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
810 #endif
811 }
812 template <typename T, size_t N>
813 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
814                                 const Mask128<T, N> b) {
815 #if HWY_COMPILER_HAS_MASK_INTRINSICS
816   return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
817 #else
818   return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
819 #endif
820 }
821 template <typename T, size_t N>
822 HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
823                                 const Mask128<T, N> b) {
824 #if HWY_COMPILER_HAS_MASK_INTRINSICS
825   return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
826 #else
827   return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
828 #endif
829 }
830 
831 template <typename T, size_t N>
832 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
833                             const Mask128<T, N> b) {
834 #if HWY_COMPILER_HAS_MASK_INTRINSICS
835   return Mask128<T, N>{_kor_mask16(a.raw, b.raw)};
836 #else
837   return Mask128<T, N>{static_cast<__mmask16>(a.raw | b.raw)};
838 #endif
839 }
840 template <typename T, size_t N>
841 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
842                             const Mask128<T, N> b) {
843 #if HWY_COMPILER_HAS_MASK_INTRINSICS
844   return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
845 #else
846   return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
847 #endif
848 }
849 template <typename T, size_t N>
850 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
851                             const Mask128<T, N> b) {
852 #if HWY_COMPILER_HAS_MASK_INTRINSICS
853   return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
854 #else
855   return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
856 #endif
857 }
858 template <typename T, size_t N>
859 HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
860                             const Mask128<T, N> b) {
861 #if HWY_COMPILER_HAS_MASK_INTRINSICS
862   return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
863 #else
864   return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
865 #endif
866 }
867 
868 template <typename T, size_t N>
869 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
870                              const Mask128<T, N> b) {
871 #if HWY_COMPILER_HAS_MASK_INTRINSICS
872   return Mask128<T, N>{_kxor_mask16(a.raw, b.raw)};
873 #else
874   return Mask128<T, N>{static_cast<__mmask16>(a.raw ^ b.raw)};
875 #endif
876 }
877 template <typename T, size_t N>
878 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
879                              const Mask128<T, N> b) {
880 #if HWY_COMPILER_HAS_MASK_INTRINSICS
881   return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
882 #else
883   return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
884 #endif
885 }
886 template <typename T, size_t N>
887 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
888                              const Mask128<T, N> b) {
889 #if HWY_COMPILER_HAS_MASK_INTRINSICS
890   return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
891 #else
892   return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
893 #endif
894 }
895 template <typename T, size_t N>
896 HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
897                              const Mask128<T, N> b) {
898 #if HWY_COMPILER_HAS_MASK_INTRINSICS
899   return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
900 #else
901   return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
902 #endif
903 }
904 
905 }  // namespace detail
906 
907 template <typename T, size_t N>
908 HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
909   return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
910 }
911 
912 template <typename T, size_t N>
913 HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
914   return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
915 }
916 
917 template <typename T, size_t N>
918 HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
919   return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
920 }
921 
922 template <typename T, size_t N>
923 HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
924   return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
925 }
926 
927 template <typename T, size_t N>
928 HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
929   // Flip only the valid bits.
930   return Xor(m, Mask128<T, N>::FromBits((1ull << N) - 1));
931 }
932 
933 #else  // AVX2 or below
934 
935 // ------------------------------ Mask
936 
937 // Mask and Vec are the same (true = FF..FF).
938 template <typename T, size_t N>
939 HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
940   return Mask128<T, N>{v.raw};
941 }
942 
943 template <typename T, size_t N>
944 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
945   return Vec128<T, N>{v.raw};
946 }
947 
948 template <typename T, size_t N>
949 HWY_API Vec128<T, N> VecFromMask(const Simd<T, N> /* tag */,
950                                  const Mask128<T, N> v) {
951   return Vec128<T, N>{v.raw};
952 }
953 
954 #if HWY_TARGET == HWY_SSSE3
955 
956 // mask ? yes : no
957 template <typename T, size_t N>
958 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
959                                 Vec128<T, N> no) {
960   const auto vmask = VecFromMask(Simd<T, N>(), mask);
961   return Or(And(vmask, yes), AndNot(vmask, no));
962 }
963 
964 #else  // HWY_TARGET == HWY_SSSE3
965 
966 // mask ? yes : no
967 template <typename T, size_t N>
968 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
969                                 Vec128<T, N> no) {
970   return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)};
971 }
972 template <size_t N>
973 HWY_API Vec128<float, N> IfThenElse(const Mask128<float, N> mask,
974                                     const Vec128<float, N> yes,
975                                     const Vec128<float, N> no) {
976   return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)};
977 }
978 template <size_t N>
979 HWY_API Vec128<double, N> IfThenElse(const Mask128<double, N> mask,
980                                      const Vec128<double, N> yes,
981                                      const Vec128<double, N> no) {
982   return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)};
983 }
984 
985 #endif  // HWY_TARGET == HWY_SSSE3
986 
987 // mask ? yes : 0
988 template <typename T, size_t N>
989 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
990   return yes & VecFromMask(Simd<T, N>(), mask);
991 }
992 
993 // mask ? 0 : no
994 template <typename T, size_t N>
995 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
996   return AndNot(VecFromMask(Simd<T, N>(), mask), no);
997 }
998 
999 // ------------------------------ Mask logical
1000 
1001 template <typename T, size_t N>
1002 HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
1003   return MaskFromVec(Not(VecFromMask(Simd<T, N>(), m)));
1004 }
1005 
1006 template <typename T, size_t N>
1007 HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
1008   const Simd<T, N> d;
1009   return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
1010 }
1011 
1012 template <typename T, size_t N>
1013 HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
1014   const Simd<T, N> d;
1015   return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
1016 }
1017 
1018 template <typename T, size_t N>
1019 HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
1020   const Simd<T, N> d;
1021   return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
1022 }
1023 
1024 template <typename T, size_t N>
1025 HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
1026   const Simd<T, N> d;
1027   return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
1028 }
1029 
1030 #endif  // HWY_TARGET <= HWY_AVX3
1031 
1032 // ================================================== SWIZZLE (1)
1033 
1034 // ------------------------------ Hard-coded shuffles
1035 
1036 // Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
1037 // Shuffle0321 rotates one lane to the right (the previous least-significant
1038 // lane is now most-significant). These could also be implemented via
1039 // CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
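// For example, if the lanes (3,2,1,0) hold (d,c,b,a), then Shuffle0321 yields
// (a,d,c,b) and Shuffle2301 yields (c,d,a,b).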
1040 
1041 // Swap 32-bit halves in 64-bit halves.
1042 template <size_t N>
1043 HWY_API Vec128<uint32_t, N> Shuffle2301(const Vec128<uint32_t, N> v) {
1044   static_assert(N == 2 || N == 4, "Does not make sense for N=1");
1045   return Vec128<uint32_t, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
1046 }
1047 template <size_t N>
1048 HWY_API Vec128<int32_t, N> Shuffle2301(const Vec128<int32_t, N> v) {
1049   static_assert(N == 2 || N == 4, "Does not make sense for N=1");
1050   return Vec128<int32_t, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
1051 }
1052 template <size_t N>
1053 HWY_API Vec128<float, N> Shuffle2301(const Vec128<float, N> v) {
1054   static_assert(N == 2 || N == 4, "Does not make sense for N=1");
1055   return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0xB1)};
1056 }
1057 
1058 // Swap 64-bit halves
1059 HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
1060   return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1061 }
1062 HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
1063   return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1064 }
1065 HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
1066   return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x4E)};
1067 }
1068 HWY_API Vec128<uint64_t> Shuffle01(const Vec128<uint64_t> v) {
1069   return Vec128<uint64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1070 }
1071 HWY_API Vec128<int64_t> Shuffle01(const Vec128<int64_t> v) {
1072   return Vec128<int64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1073 }
1074 HWY_API Vec128<double> Shuffle01(const Vec128<double> v) {
1075   return Vec128<double>{_mm_shuffle_pd(v.raw, v.raw, 1)};
1076 }
1077 
1078 // Rotate right 32 bits
1079 HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
1080   return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
1081 }
1082 HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
1083   return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
1084 }
1085 HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
1086   return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x39)};
1087 }
1088 // Rotate left 32 bits
1089 HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
1090   return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
1091 }
1092 HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
1093   return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
1094 }
1095 HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
1096   return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x93)};
1097 }
1098 
1099 // Reverse
1100 HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
1101   return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
1102 }
1103 HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
1104   return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
1105 }
1106 HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
1107   return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x1B)};
1108 }
1109 
1110 // ================================================== COMPARE
1111 
1112 #if HWY_TARGET <= HWY_AVX3
1113 
1114 // Comparisons set a mask bit to 1 if the condition is true, else 0.
1115 
1116 template <typename TFrom, size_t NFrom, typename TTo, size_t NTo>
1117 HWY_API Mask128<TTo, NTo> RebindMask(Simd<TTo, NTo> /*tag*/,
1118                                      Mask128<TFrom, NFrom> m) {
1119   static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1120   return Mask128<TTo, NTo>{m.raw};
1121 }
1122 
1123 namespace detail {
1124 
1125 template <typename T, size_t N>
1126 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<1> /*tag*/, const Vec128<T, N> v,
1127                                  const Vec128<T, N> bit) {
1128   return Mask128<T, N>{_mm_test_epi8_mask(v.raw, bit.raw)};
1129 }
1130 template <typename T, size_t N>
1131 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<2> /*tag*/, const Vec128<T, N> v,
1132                                  const Vec128<T, N> bit) {
1133   return Mask128<T, N>{_mm_test_epi16_mask(v.raw, bit.raw)};
1134 }
1135 template <typename T, size_t N>
1136 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<4> /*tag*/, const Vec128<T, N> v,
1137                                  const Vec128<T, N> bit) {
1138   return Mask128<T, N>{_mm_test_epi32_mask(v.raw, bit.raw)};
1139 }
1140 template <typename T, size_t N>
1141 HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<8> /*tag*/, const Vec128<T, N> v,
1142                                  const Vec128<T, N> bit) {
1143   return Mask128<T, N>{_mm_test_epi64_mask(v.raw, bit.raw)};
1144 }
1145 
1146 }  // namespace detail
1147 
1148 template <typename T, size_t N>
1149 HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) {
1150   static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1151   return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
1152 }
1153 
1154 // ------------------------------ Equality
1155 
1156 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1157 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1158   return Mask128<T, N>{_mm_cmpeq_epi8_mask(a.raw, b.raw)};
1159 }
1160 
1161 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1162 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1163   return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)};
1164 }
1165 
1166 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1167 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1168   return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)};
1169 }
1170 
1171 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1172 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1173   return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)};
1174 }
1175 
1176 template <size_t N>
1177 HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) {
1178   return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1179 }
1180 
1181 template <size_t N>
1182 HWY_API Mask128<double, N> operator==(Vec128<double, N> a,
1183                                       Vec128<double, N> b) {
1184   return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1185 }
1186 
1187 // ------------------------------ Inequality
1188 
1189 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1190 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1191   return Mask128<T, N>{_mm_cmpneq_epi8_mask(a.raw, b.raw)};
1192 }
1193 
1194 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1195 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1196   return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)};
1197 }
1198 
1199 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1200 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1201   return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)};
1202 }
1203 
1204 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1205 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1206   return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)};
1207 }
1208 
1209 template <size_t N>
1210 HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
1211   return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1212 }
1213 
1214 template <size_t N>
1215 HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
1216                                       Vec128<double, N> b) {
1217   return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1218 }
1219 
1220 // ------------------------------ Strict inequality
1221 
1222 // Signed/float <
1223 template <size_t N>
1224 HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
1225   return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)};
1226 }
1227 template <size_t N>
1228 HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
1229                                       Vec128<int16_t, N> b) {
1230   return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)};
1231 }
1232 template <size_t N>
1233 HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
1234                                       Vec128<int32_t, N> b) {
1235   return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)};
1236 }
1237 template <size_t N>
1238 HWY_API Mask128<int64_t, N> operator>(Vec128<int64_t, N> a,
1239                                       Vec128<int64_t, N> b) {
1240   return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)};
1241 }
1242 
1243 template <size_t N>
1244 HWY_API Mask128<uint8_t, N> operator>(Vec128<uint8_t, N> a,
1245                                       Vec128<uint8_t, N> b) {
1246   return Mask128<uint8_t, N>{_mm_cmpgt_epu8_mask(a.raw, b.raw)};
1247 }
1248 template <size_t N>
1249 HWY_API Mask128<uint16_t, N> operator>(Vec128<uint16_t, N> a,
1250                                        Vec128<uint16_t, N> b) {
1251   return Mask128<uint16_t, N>{_mm_cmpgt_epu16_mask(a.raw, b.raw)};
1252 }
1253 template <size_t N>
1254 HWY_API Mask128<uint32_t, N> operator>(Vec128<uint32_t, N> a,
1255                                        Vec128<uint32_t, N> b) {
1256   return Mask128<uint32_t, N>{_mm_cmpgt_epu32_mask(a.raw, b.raw)};
1257 }
1258 template <size_t N>
1259 HWY_API Mask128<uint64_t, N> operator>(Vec128<uint64_t, N> a,
1260                                        Vec128<uint64_t, N> b) {
1261   return Mask128<uint64_t, N>{_mm_cmpgt_epu64_mask(a.raw, b.raw)};
1262 }
1263 
1264 template <size_t N>
1265 HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
1266   return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
1267 }
1268 template <size_t N>
1269 HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
1270   return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
1271 }
1272 
1273 // ------------------------------ Weak inequality
1274 
1275 template <size_t N>
1276 HWY_API Mask128<float, N> operator>=(Vec128<float, N> a, Vec128<float, N> b) {
1277   return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
1278 }
1279 template <size_t N>
1280 HWY_API Mask128<double, N> operator>=(Vec128<double, N> a,
1281                                       Vec128<double, N> b) {
1282   return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
1283 }
1284 
1285 // ------------------------------ Mask
1286 
1287 namespace detail {
1288 
1289 template <typename T, size_t N>
1290 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<1> /*tag*/,
1291                                      const Vec128<T, N> v) {
1292   return Mask128<T, N>{_mm_movepi8_mask(v.raw)};
1293 }
1294 template <typename T, size_t N>
1295 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<2> /*tag*/,
1296                                      const Vec128<T, N> v) {
1297   return Mask128<T, N>{_mm_movepi16_mask(v.raw)};
1298 }
1299 template <typename T, size_t N>
1300 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<4> /*tag*/,
1301                                      const Vec128<T, N> v) {
1302   return Mask128<T, N>{_mm_movepi32_mask(v.raw)};
1303 }
1304 template <typename T, size_t N>
1305 HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<8> /*tag*/,
1306                                      const Vec128<T, N> v) {
1307   return Mask128<T, N>{_mm_movepi64_mask(v.raw)};
1308 }
1309 
1310 }  // namespace detail
1311 
1312 template <typename T, size_t N>
1313 HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
1314   return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
1315 }
1316 // There do not seem to be native floating-point versions of these instructions.
1317 template <size_t N>
1318 HWY_API Mask128<float, N> MaskFromVec(const Vec128<float, N> v) {
1319   return Mask128<float, N>{MaskFromVec(BitCast(Simd<int32_t, N>(), v)).raw};
1320 }
1321 template <size_t N>
1322 HWY_API Mask128<double, N> MaskFromVec(const Vec128<double, N> v) {
1323   return Mask128<double, N>{MaskFromVec(BitCast(Simd<int64_t, N>(), v)).raw};
1324 }
1325 
1326 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1327 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1328   return Vec128<T, N>{_mm_movm_epi8(v.raw)};
1329 }
1330 
1331 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1332 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1333   return Vec128<T, N>{_mm_movm_epi16(v.raw)};
1334 }
1335 
1336 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1337 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1338   return Vec128<T, N>{_mm_movm_epi32(v.raw)};
1339 }
1340 
1341 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1342 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1343   return Vec128<T, N>{_mm_movm_epi64(v.raw)};
1344 }
1345 
1346 template <size_t N>
1347 HWY_API Vec128<float, N> VecFromMask(const Mask128<float, N> v) {
1348   return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))};
1349 }
1350 
1351 template <size_t N>
1352 HWY_API Vec128<double, N> VecFromMask(const Mask128<double, N> v) {
1353   return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))};
1354 }
1355 
1356 template <typename T, size_t N>
1357 HWY_API Vec128<T, N> VecFromMask(Simd<T, N> /* tag */, const Mask128<T, N> v) {
1358   return VecFromMask(v);
1359 }
1360 
1361 #else  // AVX2 or below
1362 
1363 // Comparisons fill a lane with 1-bits if the condition is true, else 0.
1364 
1365 template <typename TFrom, typename TTo, size_t N>
1366 HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N> /*tag*/, Mask128<TFrom, N> m) {
1367   static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1368   const Simd<TFrom, N> d;
1369   return MaskFromVec(BitCast(Simd<TTo, N>(), VecFromMask(d, m)));
1370 }
1371 
1372 template <typename T, size_t N>
1373 HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
1374   static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1375   return (v & bit) == bit;
1376 }
1377 
1378 // ------------------------------ Equality
1379 
1380 // Unsigned
1381 template <size_t N>
1382 HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
1383                                        const Vec128<uint8_t, N> b) {
1384   return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
1385 }
1386 template <size_t N>
1387 HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
1388                                         const Vec128<uint16_t, N> b) {
1389   return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
1390 }
1391 template <size_t N>
1392 HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
1393                                         const Vec128<uint32_t, N> b) {
1394   return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
1395 }
1396 template <size_t N>
1397 HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
1398                                         const Vec128<uint64_t, N> b) {
1399 #if HWY_TARGET == HWY_SSSE3
1400   const Simd<uint32_t, N * 2> d32;
1401   const Simd<uint64_t, N> d64;
1402   const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
1403   const auto cmp64 = cmp32 & Shuffle2301(cmp32);
1404   return MaskFromVec(BitCast(d64, cmp64));
1405 #else
1406   return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)};
1407 #endif
1408 }
1409 
1410 // Signed
1411 template <size_t N>
1412 HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
1413                                       const Vec128<int8_t, N> b) {
1414   return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
1415 }
1416 template <size_t N>
1417 HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
1418                                        Vec128<int16_t, N> b) {
1419   return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
1420 }
1421 template <size_t N>
1422 HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
1423                                        const Vec128<int32_t, N> b) {
1424   return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
1425 }
1426 template <size_t N>
1427 HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
1428                                        const Vec128<int64_t, N> b) {
  // Signed and unsigned equality are the same bitwise comparison; reuse the
  // unsigned version to avoid duplicating its SSSE3 workaround.
1430   const Simd<uint64_t, N> du;
1431   return RebindMask(Simd<int64_t, N>(), BitCast(du, a) == BitCast(du, b));
1432 }
1433 
1434 // Float
1435 template <size_t N>
1436 HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
1437                                      const Vec128<float, N> b) {
1438   return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)};
1439 }
1440 template <size_t N>
1441 HWY_API Mask128<double, N> operator==(const Vec128<double, N> a,
1442                                       const Vec128<double, N> b) {
1443   return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)};
1444 }
1445 
1446 // ------------------------------ Inequality
1447 
1448 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
1449 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1450   return Not(a == b);
1451 }
1452 
1453 template <size_t N>
1454 HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
1455                                      const Vec128<float, N> b) {
1456   return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)};
1457 }
1458 template <size_t N>
1459 HWY_API Mask128<double, N> operator!=(const Vec128<double, N> a,
1460                                       const Vec128<double, N> b) {
1461   return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)};
1462 }
1463 
1464 // ------------------------------ Strict inequality
1465 
1466 // Signed/float <
1467 template <size_t N>
1468 HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
1469   return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)};
1470 }
1471 template <size_t N>
1472 HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
1473                                       Vec128<int16_t, N> b) {
1474   return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)};
1475 }
1476 template <size_t N>
1477 HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
1478                                       Vec128<int32_t, N> b) {
1479   return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)};
1480 }
1481 
1482 template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
1483 HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
1484   const Simd<T, N> du;
1485   const RebindToSigned<decltype(du)> di;
1486   const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
1487   return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
1488 }
1489 
1490 template <size_t N>
1491 HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
1492   return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)};
1493 }
1494 template <size_t N>
1495 HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
1496   return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)};
1497 }
1498 
1499 template <size_t N>
1500 HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
1501                                       const Vec128<int64_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  // If the upper (signed) halves differ, their comparison is the answer.
  const __m128i m_gt = _mm_cmpgt_epi32(a.raw, b.raw);

  // Otherwise, the *unsigned* comparison of the lower halves decides. Flipping
  // the sign bit of the lower halves lets the signed compare act as unsigned.
  const __m128i flip_lo = _mm_set1_epi64x(0x80000000LL);
  const __m128i lo_gt32 = _mm_cmpgt_epi32(_mm_xor_si128(a.raw, flip_lo),
                                          _mm_xor_si128(b.raw, flip_lo));

  const __m128i m_eq = _mm_cmpeq_epi32(a.raw, b.raw);
  const __m128i lo_in_hi = _mm_shuffle_epi32(lo_gt32, _MM_SHUFFLE(2, 2, 0, 0));
  const __m128i lo_gt = _mm_and_si128(m_eq, lo_in_hi);

  const __m128i gt = _mm_or_si128(lo_gt, m_gt);
  // Copy result in upper 32 bits to lower 32 bits.
  return Mask128<int64_t, N>{_mm_shuffle_epi32(gt, _MM_SHUFFLE(3, 3, 1, 1))};
1514 #else
1515   return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)};  // SSE4.2
1516 #endif
1517 }
1518 
1519 // ------------------------------ Weak inequality
1520 
1521 template <size_t N>
1522 HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
1523                                      const Vec128<float, N> b) {
1524   return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)};
1525 }
1526 template <size_t N>
1527 HWY_API Mask128<double, N> operator>=(const Vec128<double, N> a,
1528                                       const Vec128<double, N> b) {
1529   return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
1530 }
1531 
1532 #endif  // HWY_TARGET <= HWY_AVX3
1533 
1534 // ------------------------------ Reversed comparisons
1535 
1536 template <typename T, size_t N>
1537 HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
1538   return b > a;
1539 }
1540 
1541 template <typename T, size_t N>
1542 HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
1543   return b >= a;
1544 }
1545 
1546 // ------------------------------ FirstN (Iota, Lt)
1547 
1548 template <typename T, size_t N, HWY_IF_LE128(T, N)>
1549 HWY_API Mask128<T, N> FirstN(const Simd<T, N> d, size_t num) {
1550 #if HWY_TARGET <= HWY_AVX3
1551   (void)d;
1552   const uint64_t all = (1ull << N) - 1;
1553   // BZHI only looks at the lower 8 bits of num!
1554   const uint64_t bits = (num > 255) ? all : _bzhi_u64(all, num);
1555   return Mask128<T, N>::FromBits(bits);
1556 #else
1557   const RebindToSigned<decltype(d)> di;  // Signed comparisons are cheaper.
1558   return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
1559 #endif
1560 }
1561 
1562 template <class D>
1563 using MFromD = decltype(FirstN(D(), 0));
1564 
1565 // ================================================== MEMORY (1)
1566 
1567 // Clang static analysis claims the memory immediately after a partial vector
1568 // store is uninitialized, and also flags the input to partial loads (at least
1569 // for loadl_pd) as "garbage". This is a false alarm because msan does not
1570 // raise errors. We work around this by using CopyBytes instead of intrinsics,
1571 // but only for the analyzer to avoid potentially bad code generation.
1572 // Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7.
1573 #ifndef HWY_SAFE_PARTIAL_LOAD_STORE
1574 #if defined(__clang_analyzer__) || \
1575     (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
1576 #define HWY_SAFE_PARTIAL_LOAD_STORE 1
1577 #else
1578 #define HWY_SAFE_PARTIAL_LOAD_STORE 0
1579 #endif
1580 #endif  // HWY_SAFE_PARTIAL_LOAD_STORE
1581 
1582 // ------------------------------ Load
1583 
1584 template <typename T>
1585 HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
1586   return Vec128<T>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))};
1587 }
1588 HWY_API Vec128<float> Load(Full128<float> /* tag */,
1589                            const float* HWY_RESTRICT aligned) {
1590   return Vec128<float>{_mm_load_ps(aligned)};
1591 }
1592 HWY_API Vec128<double> Load(Full128<double> /* tag */,
1593                             const double* HWY_RESTRICT aligned) {
1594   return Vec128<double>{_mm_load_pd(aligned)};
1595 }
1596 
1597 template <typename T>
1598 HWY_API Vec128<T> LoadU(Full128<T> /* tag */, const T* HWY_RESTRICT p) {
1599   return Vec128<T>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))};
1600 }
1601 HWY_API Vec128<float> LoadU(Full128<float> /* tag */,
1602                             const float* HWY_RESTRICT p) {
1603   return Vec128<float>{_mm_loadu_ps(p)};
1604 }
1605 HWY_API Vec128<double> LoadU(Full128<double> /* tag */,
1606                              const double* HWY_RESTRICT p) {
1607   return Vec128<double>{_mm_loadu_pd(p)};
1608 }
1609 
1610 template <typename T>
1611 HWY_API Vec128<T, 8 / sizeof(T)> Load(Simd<T, 8 / sizeof(T)> /* tag */,
1612                                       const T* HWY_RESTRICT p) {
1613 #if HWY_SAFE_PARTIAL_LOAD_STORE
1614   __m128i v = _mm_setzero_si128();
1615   CopyBytes<8>(p, &v);
1616   return Vec128<T, 8 / sizeof(T)>{v};
1617 #else
1618   return Vec128<T, 8 / sizeof(T)>{
1619       _mm_loadl_epi64(reinterpret_cast<const __m128i*>(p))};
1620 #endif
1621 }
1622 
1623 HWY_API Vec128<float, 2> Load(Simd<float, 2> /* tag */,
1624                               const float* HWY_RESTRICT p) {
1625 #if HWY_SAFE_PARTIAL_LOAD_STORE
1626   __m128 v = _mm_setzero_ps();
1627   CopyBytes<8>(p, &v);
1628   return Vec128<float, 2>{v};
1629 #else
1630   const __m128 hi = _mm_setzero_ps();
1631   return Vec128<float, 2>{_mm_loadl_pi(hi, reinterpret_cast<const __m64*>(p))};
1632 #endif
1633 }
1634 
1635 HWY_API Vec128<double, 1> Load(Simd<double, 1> /* tag */,
1636                                const double* HWY_RESTRICT p) {
1637 #if HWY_SAFE_PARTIAL_LOAD_STORE
1638   __m128d v = _mm_setzero_pd();
1639   CopyBytes<8>(p, &v);
1640   return Vec128<double, 1>{v};
1641 #else
1642   return Vec128<double, 1>{_mm_load_sd(p)};
1643 #endif
1644 }
1645 
1646 HWY_API Vec128<float, 1> Load(Simd<float, 1> /* tag */,
1647                               const float* HWY_RESTRICT p) {
1648 #if HWY_SAFE_PARTIAL_LOAD_STORE
1649   __m128 v = _mm_setzero_ps();
1650   CopyBytes<4>(p, &v);
1651   return Vec128<float, 1>{v};
1652 #else
1653   return Vec128<float, 1>{_mm_load_ss(p)};
1654 #endif
1655 }
1656 
1657 // Any <= 32 bit except <float, 1>
1658 template <typename T, size_t N, HWY_IF_LE32(T, N)>
1659 HWY_API Vec128<T, N> Load(Simd<T, N> /* tag */, const T* HWY_RESTRICT p) {
1660   constexpr size_t kSize = sizeof(T) * N;
1661 #if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128i v = _mm_setzero_si128();
1663   CopyBytes<kSize>(p, &v);
1664   return Vec128<T, N>{v};
1665 #else
1666   int32_t bits;
1667   CopyBytes<kSize>(p, &bits);
1668   return Vec128<T, N>{_mm_cvtsi32_si128(bits)};
1669 #endif
1670 }
1671 
1672 // For < 128 bit, LoadU == Load.
1673 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1674 HWY_API Vec128<T, N> LoadU(Simd<T, N> d, const T* HWY_RESTRICT p) {
1675   return Load(d, p);
1676 }
1677 
1678 // 128-bit SIMD => nothing to duplicate, same as an unaligned load.
1679 template <typename T, size_t N, HWY_IF_LE128(T, N)>
1680 HWY_API Vec128<T, N> LoadDup128(Simd<T, N> d, const T* HWY_RESTRICT p) {
1681   return LoadU(d, p);
1682 }
1683 
// Returns a vector with each lane i in [0, N) set to "first" + i.
1685 template <typename T, size_t N, typename T2, HWY_IF_LE128(T, N)>
1686 HWY_API Vec128<T, N> Iota(const Simd<T, N> d, const T2 first) {
1687   HWY_ALIGN T lanes[16 / sizeof(T)];
1688   for (size_t i = 0; i < 16 / sizeof(T); ++i) {
1689     lanes[i] = static_cast<T>(first + static_cast<T2>(i));
1690   }
1691   return Load(d, lanes);
1692 }
1693 
1694 // ------------------------------ MaskedLoad
1695 
1696 #if HWY_TARGET <= HWY_AVX3
1697 
1698 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1699 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> /* tag */,
1700                                 const T* HWY_RESTRICT aligned) {
1701   return Vec128<T, N>{_mm_maskz_load_epi32(m.raw, aligned)};
1702 }
1703 
1704 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1705 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> /* tag */,
1706                                 const T* HWY_RESTRICT aligned) {
1707   return Vec128<T, N>{_mm_maskz_load_epi64(m.raw, aligned)};
1708 }
1709 
1710 template <size_t N>
1711 HWY_API Vec128<float, N> MaskedLoad(Mask128<float, N> m,
1712                                     Simd<float, N> /* tag */,
1713                                     const float* HWY_RESTRICT aligned) {
1714   return Vec128<float, N>{_mm_maskz_load_ps(m.raw, aligned)};
1715 }
1716 
1717 template <size_t N>
1718 HWY_API Vec128<double, N> MaskedLoad(Mask128<double, N> m,
1719                                      Simd<double, N> /* tag */,
1720                                      const double* HWY_RESTRICT aligned) {
1721   return Vec128<double, N>{_mm_maskz_load_pd(m.raw, aligned)};
1722 }
1723 
1724 // There is no load_epi8/16, so use loadu instead.
1725 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1726 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> /* tag */,
1727                                 const T* HWY_RESTRICT aligned) {
1728   return Vec128<T, N>{_mm_maskz_loadu_epi8(m.raw, aligned)};
1729 }
1730 
1731 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1732 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> /* tag */,
1733                                 const T* HWY_RESTRICT aligned) {
1734   return Vec128<T, N>{_mm_maskz_loadu_epi16(m.raw, aligned)};
1735 }
1736 
1737 #elif HWY_TARGET == HWY_AVX2
1738 
1739 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1740 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> /* tag */,
1741                                 const T* HWY_RESTRICT aligned) {
1742   auto aligned_p = reinterpret_cast<const int*>(aligned);  // NOLINT
1743   return Vec128<T, N>{_mm_maskload_epi32(aligned_p, m.raw)};
1744 }
1745 
1746 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1747 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> /* tag */,
1748                                 const T* HWY_RESTRICT aligned) {
1749   auto aligned_p = reinterpret_cast<const long long*>(aligned);  // NOLINT
1750   return Vec128<T, N>{_mm_maskload_epi64(aligned_p, m.raw)};
1751 }
1752 
1753 template <size_t N>
1754 HWY_API Vec128<float, N> MaskedLoad(Mask128<float, N> m, Simd<float, N> d,
1755                                     const float* HWY_RESTRICT aligned) {
1756   const Vec128<int32_t, N> mi =
1757       BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
1758   return Vec128<float, N>{_mm_maskload_ps(aligned, mi.raw)};
1759 }
1760 
1761 template <size_t N>
1762 HWY_API Vec128<double, N> MaskedLoad(Mask128<double, N> m, Simd<double, N> d,
1763                                      const double* HWY_RESTRICT aligned) {
1764   const Vec128<int64_t, N> mi =
1765       BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
1766   return Vec128<double, N>{_mm_maskload_pd(aligned, mi.raw)};
1767 }
1768 
1769 // There is no maskload_epi8/16, so blend instead.
1770 template <typename T, size_t N, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
1771 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> d,
1772                                 const T* HWY_RESTRICT aligned) {
1773   return IfThenElseZero(m, Load(d, aligned));
1774 }
1775 
1776 #else  // <= SSE4
1777 
1778 // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow).
1779 template <typename T, size_t N>
1780 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N> d,
1781                                 const T* HWY_RESTRICT aligned) {
1782   return IfThenElseZero(m, Load(d, aligned));
1783 }
1784 
1785 #endif
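
// Illustrative usage (d and mask are assumed to be provided by the caller):
//   const auto v = MaskedLoad(mask, d, aligned);  // zero in unselected lanes
// This is useful for the partially-filled tail of an array.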
1786 
1787 // ------------------------------ Store
1788 
1789 template <typename T>
1790 HWY_API void Store(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT aligned) {
1791   _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
1792 }
1793 HWY_API void Store(const Vec128<float> v, Full128<float> /* tag */,
1794                    float* HWY_RESTRICT aligned) {
1795   _mm_store_ps(aligned, v.raw);
1796 }
1797 HWY_API void Store(const Vec128<double> v, Full128<double> /* tag */,
1798                    double* HWY_RESTRICT aligned) {
1799   _mm_store_pd(aligned, v.raw);
1800 }
1801 
1802 template <typename T>
1803 HWY_API void StoreU(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT p) {
1804   _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw);
1805 }
1806 HWY_API void StoreU(const Vec128<float> v, Full128<float> /* tag */,
1807                     float* HWY_RESTRICT p) {
1808   _mm_storeu_ps(p, v.raw);
1809 }
1810 HWY_API void StoreU(const Vec128<double> v, Full128<double> /* tag */,
1811                     double* HWY_RESTRICT p) {
1812   _mm_storeu_pd(p, v.raw);
1813 }
1814 
1815 template <typename T>
1816 HWY_API void Store(Vec128<T, 8 / sizeof(T)> v, Simd<T, 8 / sizeof(T)> /* tag */,
1817                    T* HWY_RESTRICT p) {
1818 #if HWY_SAFE_PARTIAL_LOAD_STORE
1819   CopyBytes<8>(&v, p);
1820 #else
1821   _mm_storel_epi64(reinterpret_cast<__m128i*>(p), v.raw);
1822 #endif
1823 }
1824 HWY_API void Store(const Vec128<float, 2> v, Simd<float, 2> /* tag */,
1825                    float* HWY_RESTRICT p) {
1826 #if HWY_SAFE_PARTIAL_LOAD_STORE
1827   CopyBytes<8>(&v, p);
1828 #else
1829   _mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw);
1830 #endif
1831 }
1832 HWY_API void Store(const Vec128<double, 1> v, Simd<double, 1> /* tag */,
1833                    double* HWY_RESTRICT p) {
1834 #if HWY_SAFE_PARTIAL_LOAD_STORE
1835   CopyBytes<8>(&v, p);
1836 #else
1837   _mm_storel_pd(p, v.raw);
1838 #endif
1839 }
1840 
1841 // Any <= 32 bit except <float, 1>
1842 template <typename T, size_t N, HWY_IF_LE32(T, N)>
1843 HWY_API void Store(Vec128<T, N> v, Simd<T, N> /* tag */, T* HWY_RESTRICT p) {
1844   CopyBytes<sizeof(T) * N>(&v, p);
1845 }
1846 HWY_API void Store(const Vec128<float, 1> v, Simd<float, 1> /* tag */,
1847                    float* HWY_RESTRICT p) {
1848 #if HWY_SAFE_PARTIAL_LOAD_STORE
1849   CopyBytes<4>(&v, p);
1850 #else
1851   _mm_store_ss(p, v.raw);
1852 #endif
1853 }
1854 
1855 // For < 128 bit, StoreU == Store.
1856 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1857 HWY_API void StoreU(const Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT p) {
1858   Store(v, d, p);
1859 }
1860 
1861 // ================================================== ARITHMETIC
1862 
1863 // ------------------------------ Addition
1864 
1865 // Unsigned
1866 template <size_t N>
1867 HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
1868                                      const Vec128<uint8_t, N> b) {
1869   return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)};
1870 }
1871 template <size_t N>
1872 HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
1873                                       const Vec128<uint16_t, N> b) {
1874   return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)};
1875 }
1876 template <size_t N>
1877 HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
1878                                       const Vec128<uint32_t, N> b) {
1879   return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)};
1880 }
1881 template <size_t N>
1882 HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
1883                                       const Vec128<uint64_t, N> b) {
1884   return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)};
1885 }
1886 
1887 // Signed
1888 template <size_t N>
1889 HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
1890                                     const Vec128<int8_t, N> b) {
1891   return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)};
1892 }
1893 template <size_t N>
1894 HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
1895                                      const Vec128<int16_t, N> b) {
1896   return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)};
1897 }
1898 template <size_t N>
1899 HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
1900                                      const Vec128<int32_t, N> b) {
1901   return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)};
1902 }
1903 template <size_t N>
1904 HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
1905                                      const Vec128<int64_t, N> b) {
1906   return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)};
1907 }
1908 
1909 // Float
1910 template <size_t N>
1911 HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
1912                                    const Vec128<float, N> b) {
1913   return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)};
1914 }
1915 template <size_t N>
1916 HWY_API Vec128<double, N> operator+(const Vec128<double, N> a,
1917                                     const Vec128<double, N> b) {
1918   return Vec128<double, N>{_mm_add_pd(a.raw, b.raw)};
1919 }
1920 
1921 // ------------------------------ Subtraction
1922 
1923 // Unsigned
1924 template <size_t N>
1925 HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
1926                                      const Vec128<uint8_t, N> b) {
1927   return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
1928 }
1929 template <size_t N>
1930 HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
1931                                       Vec128<uint16_t, N> b) {
1932   return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
1933 }
1934 template <size_t N>
1935 HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
1936                                       const Vec128<uint32_t, N> b) {
1937   return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
1938 }
1939 template <size_t N>
1940 HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
1941                                       const Vec128<uint64_t, N> b) {
1942   return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
1943 }
1944 
1945 // Signed
1946 template <size_t N>
1947 HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
1948                                     const Vec128<int8_t, N> b) {
1949   return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
1950 }
1951 template <size_t N>
1952 HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
1953                                      const Vec128<int16_t, N> b) {
1954   return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
1955 }
1956 template <size_t N>
1957 HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
1958                                      const Vec128<int32_t, N> b) {
1959   return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
1960 }
1961 template <size_t N>
1962 HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
1963                                      const Vec128<int64_t, N> b) {
1964   return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
1965 }
1966 
1967 // Float
1968 template <size_t N>
1969 HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
1970                                    const Vec128<float, N> b) {
1971   return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)};
1972 }
1973 template <size_t N>
1974 HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
1975                                     const Vec128<double, N> b) {
1976   return Vec128<double, N>{_mm_sub_pd(a.raw, b.raw)};
1977 }
1978 
1979 // ------------------------------ Saturating addition
1980 
1981 // Returns a + b clamped to the destination range.
1982 
1983 // Unsigned
1984 template <size_t N>
1985 HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
1986                                         const Vec128<uint8_t, N> b) {
1987   return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)};
1988 }
1989 template <size_t N>
1990 HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
1991                                          const Vec128<uint16_t, N> b) {
1992   return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)};
1993 }
1994 
1995 // Signed
1996 template <size_t N>
1997 HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
1998                                        const Vec128<int8_t, N> b) {
1999   return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)};
2000 }
2001 template <size_t N>
2002 HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
2003                                         const Vec128<int16_t, N> b) {
2004   return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)};
2005 }
2006 
2007 // ------------------------------ Saturating subtraction
2008 
2009 // Returns a - b clamped to the destination range.
2010 
2011 // Unsigned
2012 template <size_t N>
2013 HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
2014                                         const Vec128<uint8_t, N> b) {
2015   return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)};
2016 }
2017 template <size_t N>
2018 HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
2019                                          const Vec128<uint16_t, N> b) {
2020   return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)};
2021 }
2022 
2023 // Signed
2024 template <size_t N>
2025 HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
2026                                        const Vec128<int8_t, N> b) {
2027   return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)};
2028 }
2029 template <size_t N>
2030 HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
2031                                         const Vec128<int16_t, N> b) {
2032   return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
2033 }
2034 
2035 // ------------------------------ AverageRound
2036 
2037 // Returns (a + b + 1) / 2
2038 
2039 // Unsigned
2040 template <size_t N>
2041 HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
2042                                         const Vec128<uint8_t, N> b) {
2043   return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)};
2044 }
2045 template <size_t N>
2046 HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
2047                                          const Vec128<uint16_t, N> b) {
2048   return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
2049 }
2050 
2051 // ------------------------------ Integer multiplication
2052 
2053 template <size_t N>
2054 HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
2055                                       const Vec128<uint16_t, N> b) {
2056   return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
2057 }
2058 template <size_t N>
2059 HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
2060                                      const Vec128<int16_t, N> b) {
2061   return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
2062 }
2063 
2064 // Returns the upper 16 bits of a * b in each lane.
2065 template <size_t N>
2066 HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
2067                                     const Vec128<uint16_t, N> b) {
2068   return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)};
2069 }
2070 template <size_t N>
2071 HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
2072                                    const Vec128<int16_t, N> b) {
2073   return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
2074 }
2075 
// Multiplies even lanes (0, 2, ..) and returns the double-wide result, with
// the lower half in the even lane and the upper half in its odd neighbor lane.
2078 template <size_t N>
2079 HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
2080                                               const Vec128<uint32_t, N> b) {
2081   return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)};
2082 }
2083 
2084 #if HWY_TARGET == HWY_SSSE3
2085 
2086 template <size_t N, HWY_IF_LE64(int32_t, N)>  // N=1 or 2
2087 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
2088                                              const Vec128<int32_t, N> b) {
2089   return Set(Simd<int64_t, (N + 1) / 2>(), int64_t(GetLane(a)) * GetLane(b));
2090 }
2091 HWY_API Vec128<int64_t> MulEven(const Vec128<int32_t> a,
2092                                 const Vec128<int32_t> b) {
2093   alignas(16) int32_t a_lanes[4];
2094   alignas(16) int32_t b_lanes[4];
2095   const Full128<int32_t> di32;
2096   Store(a, di32, a_lanes);
2097   Store(b, di32, b_lanes);
2098   alignas(16) int64_t mul[2];
2099   mul[0] = int64_t(a_lanes[0]) * b_lanes[0];
2100   mul[1] = int64_t(a_lanes[2]) * b_lanes[2];
2101   return Load(Full128<int64_t>(), mul);
2102 }
2103 
2104 #else  // HWY_TARGET == HWY_SSSE3
2105 
2106 template <size_t N>
2107 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
2108                                              const Vec128<int32_t, N> b) {
2109   return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)};
2110 }
2111 
2112 #endif  // HWY_TARGET == HWY_SSSE3
2113 
2114 template <size_t N>
2115 HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
2116                                       const Vec128<uint32_t, N> b) {
2117 #if HWY_TARGET == HWY_SSSE3
2118   // Not as inefficient as it looks: _mm_mullo_epi32 has 10 cycle latency.
2119   // 64-bit right shift would also work but also needs port 5, so no benefit.
2120   // Notation: x=don't care, z=0.
2121   const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1));
2122   const auto mullo_x2x0 = MulEven(a, b);
2123   const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1));
2124   const auto mullo_x3x1 =
2125       MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1});
2126   // We could _mm_slli_epi64 by 32 to get 3z1z and OR with z2z0, but generating
2127   // the latter requires one more instruction or a constant.
2128   const __m128i mul_20 =
2129       _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0));
2130   const __m128i mul_31 =
2131       _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0));
2132   return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)};
2133 #else
2134   return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)};
2135 #endif
2136 }
2137 
2138 template <size_t N>
2139 HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
2140                                      const Vec128<int32_t, N> b) {
2141   // Same as unsigned; avoid duplicating the SSSE3 code.
2142   const Simd<uint32_t, N> du;
2143   return BitCast(Simd<int32_t, N>(), BitCast(du, a) * BitCast(du, b));
2144 }
2145 
2146 // ------------------------------ ShiftLeft
2147 
2148 template <int kBits, size_t N>
2149 HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
2150   return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)};
2151 }
2152 
2153 template <int kBits, size_t N>
2154 HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
2155   return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)};
2156 }
2157 
2158 template <int kBits, size_t N>
2159 HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
2160   return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, kBits)};
2161 }
2162 
2163 template <int kBits, size_t N>
2164 HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
2165   return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)};
2166 }
2167 template <int kBits, size_t N>
2168 HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
2169   return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)};
2170 }
2171 template <int kBits, size_t N>
2172 HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
2173   return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)};
2174 }
2175 
2176 template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2177 HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
2178   const Simd<T, N> d8;
2179   // Use raw instead of BitCast to support N=1.
2180   const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
2181   return kBits == 1
2182              ? (v + v)
2183              : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
2184 }
2185 
2186 // ------------------------------ ShiftRight
2187 
2188 template <int kBits, size_t N>
2189 HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
2190   return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)};
2191 }
2192 template <int kBits, size_t N>
2193 HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
2194   return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)};
2195 }
2196 template <int kBits, size_t N>
2197 HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
2198   return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, kBits)};
2199 }
2200 
2201 template <int kBits, size_t N>
2202 HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
2203   const Simd<uint8_t, N> d8;
2204   // Use raw instead of BitCast to support N=1.
2205   const Vec128<uint8_t, N> shifted{
2206       ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
2207   return shifted & Set(d8, 0xFF >> kBits);
2208 }
2209 
2210 template <int kBits, size_t N>
2211 HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
2212   return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)};
2213 }
2214 template <int kBits, size_t N>
2215 HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
2216   return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)};
2217 }
2218 
2219 template <int kBits, size_t N>
2220 HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
2221   const Simd<int8_t, N> di;
2222   const Simd<uint8_t, N> du;
2223   const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
2224   const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
2225   return (shifted ^ shifted_sign) - shifted_sign;
2226 }
2227 
2228 // i64 is implemented after BroadcastSignBit.
2229 
2230 // ------------------------------ RotateRight (ShiftRight, Or)
2231 
2232 template <int kBits, size_t N>
2233 HWY_API Vec128<uint32_t, N> RotateRight(const Vec128<uint32_t, N> v) {
2234   static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
2235 #if HWY_TARGET <= HWY_AVX3
2236   return Vec128<uint32_t, N>{_mm_ror_epi32(v.raw, kBits)};
2237 #else
2238   if (kBits == 0) return v;
2239   return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
2240 #endif
2241 }
2242 
2243 template <int kBits, size_t N>
2244 HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) {
2245   static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
2246 #if HWY_TARGET <= HWY_AVX3
2247   return Vec128<uint64_t, N>{_mm_ror_epi64(v.raw, kBits)};
2248 #else
2249   if (kBits == 0) return v;
2250   return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
2251 #endif
2252 }
2253 
2254 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
2255 
2256 template <size_t N>
2257 HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
2258   return VecFromMask(v < Zero(Simd<int8_t, N>()));
2259 }
2260 
2261 template <size_t N>
2262 HWY_API Vec128<int16_t, N> BroadcastSignBit(const Vec128<int16_t, N> v) {
2263   return ShiftRight<15>(v);
2264 }
2265 
2266 template <size_t N>
2267 HWY_API Vec128<int32_t, N> BroadcastSignBit(const Vec128<int32_t, N> v) {
2268   return ShiftRight<31>(v);
2269 }
2270 
2271 template <size_t N>
2272 HWY_API Vec128<int64_t, N> BroadcastSignBit(const Vec128<int64_t, N> v) {
2273 #if HWY_TARGET <= HWY_AVX3
2274   return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, 63)};
2275 #elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4
2276   return VecFromMask(v < Zero(Simd<int64_t, N>()));
2277 #else
2278   // Efficient Lt() requires SSE4.2 and BLENDVPD requires SSE4.1. 32-bit shift
2279   // avoids generating a zero.
2280   const Simd<int32_t, N * 2> d32;
2281   const auto sign = ShiftRight<31>(BitCast(d32, v));
2282   return Vec128<int64_t, N>{
2283       _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
2284 #endif
2285 }
2286 
2287 template <size_t N>
2288 HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
2289 #if HWY_TARGET <= HWY_AVX3
2290   return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
2291 #else
2292   const auto zero = Zero(Simd<int64_t, N>());
2293   return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
2294 #endif
2295 }
2296 
2297 template <int kBits, size_t N>
2298 HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
2299 #if HWY_TARGET <= HWY_AVX3
2300   return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, kBits)};
2301 #else
2302   const Simd<int64_t, N> di;
2303   const Simd<uint64_t, N> du;
2304   const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
2305   const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
2306   return right | sign;
2307 #endif
2308 }
2309 
2310 // ------------------------------ ZeroIfNegative (BroadcastSignBit)
2311 template <typename T, size_t N, HWY_IF_FLOAT(T)>
2312 HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
2313   const Simd<T, N> d;
2314 #if HWY_TARGET == HWY_SSSE3
2315   const RebindToSigned<decltype(d)> di;
2316   const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
2317 #else
2318   const auto mask = MaskFromVec(v);  // MSB is sufficient for BLENDVPS
2319 #endif
2320   return IfThenElse(mask, Zero(d), v);
2321 }
2322 
2323 // ------------------------------ ShiftLeftSame
2324 
2325 template <size_t N>
2326 HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
2327                                           const int bits) {
2328   return Vec128<uint16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2329 }
2330 template <size_t N>
2331 HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
2332                                           const int bits) {
2333   return Vec128<uint32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2334 }
2335 template <size_t N>
2336 HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
2337                                           const int bits) {
2338   return Vec128<uint64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2339 }
2340 
2341 template <size_t N>
2342 HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
2343                                          const int bits) {
2344   return Vec128<int16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2345 }
2346 
2347 template <size_t N>
2348 HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
2349                                          const int bits) {
2350   return Vec128<int32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2351 }
2352 
2353 template <size_t N>
2354 HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
2355                                          const int bits) {
2356   return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2357 }
2358 
2359 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2360 HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
2361   const Simd<T, N> d8;
2362   // Use raw instead of BitCast to support N=1.
2363   const Vec128<T, N> shifted{
2364       ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
2365   return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
2366 }
2367 
2368 // ------------------------------ ShiftRightSame (BroadcastSignBit)
2369 
2370 template <size_t N>
2371 HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
2372                                            const int bits) {
2373   return Vec128<uint16_t, N>{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2374 }
2375 template <size_t N>
2376 HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
2377                                            const int bits) {
2378   return Vec128<uint32_t, N>{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2379 }
2380 template <size_t N>
2381 HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
2382                                            const int bits) {
2383   return Vec128<uint64_t, N>{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2384 }
2385 
2386 template <size_t N>
2387 HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
2388                                           const int bits) {
2389   const Simd<uint8_t, N> d8;
2390   // Use raw instead of BitCast to support N=1.
2391   const Vec128<uint8_t, N> shifted{
2392       ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
2393   return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
2394 }
2395 
2396 template <size_t N>
2397 HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
2398                                           const int bits) {
2399   return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2400 }
2401 
2402 template <size_t N>
2403 HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
2404                                           const int bits) {
2405   return Vec128<int32_t, N>{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2406 }
2407 template <size_t N>
2408 HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
2409                                           const int bits) {
2410 #if HWY_TARGET <= HWY_AVX3
2411   return Vec128<int64_t, N>{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2412 #else
2413   const Simd<int64_t, N> di;
2414   const Simd<uint64_t, N> du;
2415   const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
2416   const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits);
2417   return right | sign;
2418 #endif
2419 }
2420 
2421 template <size_t N>
2422 HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
2423   const Simd<int8_t, N> di;
2424   const Simd<uint8_t, N> du;
2425   const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
2426   const auto shifted_sign =
2427       BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
2428   return (shifted ^ shifted_sign) - shifted_sign;
2429 }
2430 
2431 // ------------------------------ Floating-point mul / div
2432 
2433 template <size_t N>
2434 HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
2435   return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)};
2436 }
2437 HWY_API Vec128<float, 1> operator*(const Vec128<float, 1> a,
2438                                    const Vec128<float, 1> b) {
2439   return Vec128<float, 1>{_mm_mul_ss(a.raw, b.raw)};
2440 }
2441 template <size_t N>
2442 HWY_API Vec128<double, N> operator*(const Vec128<double, N> a,
2443                                     const Vec128<double, N> b) {
2444   return Vec128<double, N>{_mm_mul_pd(a.raw, b.raw)};
2445 }
2446 HWY_API Vec128<double, 1> operator*(const Vec128<double, 1> a,
2447                                     const Vec128<double, 1> b) {
2448   return Vec128<double, 1>{_mm_mul_sd(a.raw, b.raw)};
2449 }
2450 
2451 template <size_t N>
2452 HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
2453                                    const Vec128<float, N> b) {
2454   return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)};
2455 }
2456 HWY_API Vec128<float, 1> operator/(const Vec128<float, 1> a,
2457                                    const Vec128<float, 1> b) {
2458   return Vec128<float, 1>{_mm_div_ss(a.raw, b.raw)};
2459 }
2460 template <size_t N>
2461 HWY_API Vec128<double, N> operator/(const Vec128<double, N> a,
2462                                     const Vec128<double, N> b) {
2463   return Vec128<double, N>{_mm_div_pd(a.raw, b.raw)};
2464 }
2465 HWY_API Vec128<double, 1> operator/(const Vec128<double, 1> a,
2466                                     const Vec128<double, 1> b) {
2467   return Vec128<double, 1>{_mm_div_sd(a.raw, b.raw)};
2468 }
2469 
2470 // Approximate reciprocal
2471 template <size_t N>
2472 HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
2473   return Vec128<float, N>{_mm_rcp_ps(v.raw)};
2474 }
2475 HWY_API Vec128<float, 1> ApproximateReciprocal(const Vec128<float, 1> v) {
2476   return Vec128<float, 1>{_mm_rcp_ss(v.raw)};
2477 }
2478 
2479 // Absolute value of difference.
2480 template <size_t N>
2481 HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
2482                                  const Vec128<float, N> b) {
2483   return Abs(a - b);
2484 }
2485 
2486 // ------------------------------ Floating-point multiply-add variants
2487 
2488 // Returns mul * x + add
2489 template <size_t N>
2490 HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
2491                                 const Vec128<float, N> x,
2492                                 const Vec128<float, N> add) {
2493 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2494   return mul * x + add;
2495 #else
2496   return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)};
2497 #endif
2498 }
2499 template <size_t N>
2500 HWY_API Vec128<double, N> MulAdd(const Vec128<double, N> mul,
2501                                  const Vec128<double, N> x,
2502                                  const Vec128<double, N> add) {
2503 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2504   return mul * x + add;
2505 #else
2506   return Vec128<double, N>{_mm_fmadd_pd(mul.raw, x.raw, add.raw)};
2507 #endif
2508 }
2509 
2510 // Returns add - mul * x
2511 template <size_t N>
2512 HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
2513                                    const Vec128<float, N> x,
2514                                    const Vec128<float, N> add) {
2515 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2516   return add - mul * x;
2517 #else
2518   return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)};
2519 #endif
2520 }
2521 template <size_t N>
2522 HWY_API Vec128<double, N> NegMulAdd(const Vec128<double, N> mul,
2523                                     const Vec128<double, N> x,
2524                                     const Vec128<double, N> add) {
2525 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2526   return add - mul * x;
2527 #else
2528   return Vec128<double, N>{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)};
2529 #endif
2530 }
2531 
2532 // Returns mul * x - sub
2533 template <size_t N>
2534 HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
2535                                 const Vec128<float, N> x,
2536                                 const Vec128<float, N> sub) {
2537 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2538   return mul * x - sub;
2539 #else
2540   return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)};
2541 #endif
2542 }
2543 template <size_t N>
2544 HWY_API Vec128<double, N> MulSub(const Vec128<double, N> mul,
2545                                  const Vec128<double, N> x,
2546                                  const Vec128<double, N> sub) {
2547 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2548   return mul * x - sub;
2549 #else
2550   return Vec128<double, N>{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)};
2551 #endif
2552 }
2553 
2554 // Returns -mul * x - sub
2555 template <size_t N>
2556 HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
2557                                    const Vec128<float, N> x,
2558                                    const Vec128<float, N> sub) {
2559 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2560   return Neg(mul) * x - sub;
2561 #else
2562   return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)};
2563 #endif
2564 }
2565 template <size_t N>
2566 HWY_API Vec128<double, N> NegMulSub(const Vec128<double, N> mul,
2567                                     const Vec128<double, N> x,
2568                                     const Vec128<double, N> sub) {
2569 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2570   return Neg(mul) * x - sub;
2571 #else
2572   return Vec128<double, N>{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)};
2573 #endif
2574 }
2575 
2576 // ------------------------------ Floating-point square root
2577 
2578 // Full precision square root
2579 template <size_t N>
2580 HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
2581   return Vec128<float, N>{_mm_sqrt_ps(v.raw)};
2582 }
2583 HWY_API Vec128<float, 1> Sqrt(const Vec128<float, 1> v) {
2584   return Vec128<float, 1>{_mm_sqrt_ss(v.raw)};
2585 }
2586 template <size_t N>
2587 HWY_API Vec128<double, N> Sqrt(const Vec128<double, N> v) {
2588   return Vec128<double, N>{_mm_sqrt_pd(v.raw)};
2589 }
2590 HWY_API Vec128<double, 1> Sqrt(const Vec128<double, 1> v) {
2591   return Vec128<double, 1>{_mm_sqrt_sd(_mm_setzero_pd(), v.raw)};
2592 }
2593 
2594 // Approximate reciprocal square root
2595 template <size_t N>
2596 HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
2597   return Vec128<float, N>{_mm_rsqrt_ps(v.raw)};
2598 }
2599 HWY_API Vec128<float, 1> ApproximateReciprocalSqrt(const Vec128<float, 1> v) {
2600   return Vec128<float, 1>{_mm_rsqrt_ss(v.raw)};
2601 }
2602 
2603 // ------------------------------ Min (Gt, IfThenElse)
2604 
2605 namespace detail {
2606 
2607 template <typename T, size_t N>
2608 HWY_INLINE HWY_MAYBE_UNUSED Vec128<T, N> MinU(const Vec128<T, N> a,
2609                                               const Vec128<T, N> b) {
2610   const Simd<T, N> du;
2611   const RebindToSigned<decltype(du)> di;
2612   const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
2613   const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
2614   return IfThenElse(gt, b, a);
2615 }
2616 
2617 }  // namespace detail
2618 
2619 // Unsigned
2620 template <size_t N>
2621 HWY_API Vec128<uint8_t, N> Min(const Vec128<uint8_t, N> a,
2622                                const Vec128<uint8_t, N> b) {
2623   return Vec128<uint8_t, N>{_mm_min_epu8(a.raw, b.raw)};
2624 }
2625 template <size_t N>
2626 HWY_API Vec128<uint16_t, N> Min(const Vec128<uint16_t, N> a,
2627                                 const Vec128<uint16_t, N> b) {
2628 #if HWY_TARGET == HWY_SSSE3
2629   return detail::MinU(a, b);
2630 #else
2631   return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)};
2632 #endif
2633 }
2634 template <size_t N>
2635 HWY_API Vec128<uint32_t, N> Min(const Vec128<uint32_t, N> a,
2636                                 const Vec128<uint32_t, N> b) {
2637 #if HWY_TARGET == HWY_SSSE3
2638   return detail::MinU(a, b);
2639 #else
2640   return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)};
2641 #endif
2642 }
2643 template <size_t N>
2644 HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
2645                                 const Vec128<uint64_t, N> b) {
2646 #if HWY_TARGET <= HWY_AVX3
2647   return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)};
2648 #else
2649   return detail::MinU(a, b);
2650 #endif
2651 }
2652 
2653 // Signed
2654 template <size_t N>
2655 HWY_API Vec128<int8_t, N> Min(const Vec128<int8_t, N> a,
2656                               const Vec128<int8_t, N> b) {
2657 #if HWY_TARGET == HWY_SSSE3
2658   return IfThenElse(a < b, a, b);
2659 #else
2660   return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)};
2661 #endif
2662 }
2663 template <size_t N>
2664 HWY_API Vec128<int16_t, N> Min(const Vec128<int16_t, N> a,
2665                                const Vec128<int16_t, N> b) {
2666   return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)};
2667 }
2668 template <size_t N>
2669 HWY_API Vec128<int32_t, N> Min(const Vec128<int32_t, N> a,
2670                                const Vec128<int32_t, N> b) {
2671 #if HWY_TARGET == HWY_SSSE3
2672   return IfThenElse(a < b, a, b);
2673 #else
2674   return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)};
2675 #endif
2676 }
2677 template <size_t N>
2678 HWY_API Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
2679                                const Vec128<int64_t, N> b) {
2680 #if HWY_TARGET <= HWY_AVX3
2681   return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)};
2682 #else
2683   return IfThenElse(a < b, a, b);
2684 #endif
2685 }
2686 
2687 // Float
2688 template <size_t N>
2689 HWY_API Vec128<float, N> Min(const Vec128<float, N> a,
2690                              const Vec128<float, N> b) {
2691   return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)};
2692 }
2693 template <size_t N>
2694 HWY_API Vec128<double, N> Min(const Vec128<double, N> a,
2695                               const Vec128<double, N> b) {
2696   return Vec128<double, N>{_mm_min_pd(a.raw, b.raw)};
2697 }
2698 
2699 // ------------------------------ Max (Gt, IfThenElse)
2700 
2701 namespace detail {
2702 template <typename T, size_t N>
2703 HWY_INLINE HWY_MAYBE_UNUSED Vec128<T, N> MaxU(const Vec128<T, N> a,
2704                                               const Vec128<T, N> b) {
2705   const Simd<T, N> du;
2706   const RebindToSigned<decltype(du)> di;
2707   const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
2708   const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
2709   return IfThenElse(gt, a, b);
2710 }
2711 
2712 }  // namespace detail
2713 
2714 // Unsigned
2715 template <size_t N>
2716 HWY_API Vec128<uint8_t, N> Max(const Vec128<uint8_t, N> a,
2717                                const Vec128<uint8_t, N> b) {
2718   return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)};
2719 }
2720 template <size_t N>
2721 HWY_API Vec128<uint16_t, N> Max(const Vec128<uint16_t, N> a,
2722                                 const Vec128<uint16_t, N> b) {
2723 #if HWY_TARGET == HWY_SSSE3
2724   return detail::MaxU(a, b);
2725 #else
2726   return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)};
2727 #endif
2728 }
2729 template <size_t N>
2730 HWY_API Vec128<uint32_t, N> Max(const Vec128<uint32_t, N> a,
2731                                 const Vec128<uint32_t, N> b) {
2732 #if HWY_TARGET == HWY_SSSE3
2733   return detail::MaxU(a, b);
2734 #else
2735   return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)};
2736 #endif
2737 }
2738 template <size_t N>
2739 HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
2740                                 const Vec128<uint64_t, N> b) {
2741 #if HWY_TARGET <= HWY_AVX3
2742   return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)};
2743 #else
2744   return detail::MaxU(a, b);
2745 #endif
2746 }
2747 
2748 // Signed
2749 template <size_t N>
2750 HWY_API Vec128<int8_t, N> Max(const Vec128<int8_t, N> a,
2751                               const Vec128<int8_t, N> b) {
2752 #if HWY_TARGET == HWY_SSSE3
2753   return IfThenElse(a < b, b, a);
2754 #else
2755   return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)};
2756 #endif
2757 }
2758 template <size_t N>
2759 HWY_API Vec128<int16_t, N> Max(const Vec128<int16_t, N> a,
2760                                const Vec128<int16_t, N> b) {
2761   return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)};
2762 }
2763 template <size_t N>
2764 HWY_API Vec128<int32_t, N> Max(const Vec128<int32_t, N> a,
2765                                const Vec128<int32_t, N> b) {
2766 #if HWY_TARGET == HWY_SSSE3
2767   return IfThenElse(a < b, b, a);
2768 #else
2769   return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)};
2770 #endif
2771 }
2772 template <size_t N>
2773 HWY_API Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
2774                                const Vec128<int64_t, N> b) {
2775 #if HWY_TARGET <= HWY_AVX3
2776   return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)};
2777 #else
2778   return IfThenElse(a < b, b, a);
2779 #endif
2780 }
2781 
2782 // Float
2783 template <size_t N>
2784 HWY_API Vec128<float, N> Max(const Vec128<float, N> a,
2785                              const Vec128<float, N> b) {
2786   return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)};
2787 }
2788 template <size_t N>
2789 HWY_API Vec128<double, N> Max(const Vec128<double, N> a,
2790                               const Vec128<double, N> b) {
2791   return Vec128<double, N>{_mm_max_pd(a.raw, b.raw)};
2792 }
2793 
2794 // ================================================== MEMORY (2)
2795 
2796 // ------------------------------ Non-temporal stores
2797 
2798 // On clang6, we see incorrect code generated for _mm_stream_pi, so
2799 // round even partial vectors up to 16 bytes.
2800 template <typename T, size_t N>
2801 HWY_API void Stream(Vec128<T, N> v, Simd<T, N> /* tag */,
2802                     T* HWY_RESTRICT aligned) {
2803   _mm_stream_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
2804 }
2805 template <size_t N>
2806 HWY_API void Stream(const Vec128<float, N> v, Simd<float, N> /* tag */,
2807                     float* HWY_RESTRICT aligned) {
2808   _mm_stream_ps(aligned, v.raw);
2809 }
2810 template <size_t N>
2811 HWY_API void Stream(const Vec128<double, N> v, Simd<double, N> /* tag */,
2812                     double* HWY_RESTRICT aligned) {
2813   _mm_stream_pd(aligned, v.raw);
2814 }
2815 
2816 // ------------------------------ Scatter
2817 
2818 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
2819 HWY_DIAGNOSTICS(push)
2820 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
2821 
2822 // Unfortunately the GCC/Clang intrinsics do not accept int64_t*.
2823 using GatherIndex64 = long long int;  // NOLINT(google-runtime-int)
2824 static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type");
2825 
2826 #if HWY_TARGET <= HWY_AVX3
2827 namespace detail {
2828 
2829 template <typename T, size_t N>
2830 HWY_INLINE void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
2831                               Simd<T, N> /* tag */, T* HWY_RESTRICT base,
2832                               const Vec128<int32_t, N> offset) {
2833   if (N == 4) {
2834     _mm_i32scatter_epi32(base, offset.raw, v.raw, 1);
2835   } else {
2836     const __mmask8 mask = (1u << N) - 1;
2837     _mm_mask_i32scatter_epi32(base, mask, offset.raw, v.raw, 1);
2838   }
2839 }
2840 template <typename T, size_t N>
2841 HWY_INLINE void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
2842                              Simd<T, N> /* tag */, T* HWY_RESTRICT base,
2843                              const Vec128<int32_t, N> index) {
2844   if (N == 4) {
2845     _mm_i32scatter_epi32(base, index.raw, v.raw, 4);
2846   } else {
2847     const __mmask8 mask = (1u << N) - 1;
2848     _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, 4);
2849   }
2850 }
2851 
2852 template <typename T, size_t N>
2853 HWY_INLINE void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
2854                               Simd<T, N> /* tag */, T* HWY_RESTRICT base,
2855                               const Vec128<int64_t, N> offset) {
2856   if (N == 2) {
2857     _mm_i64scatter_epi64(base, offset.raw, v.raw, 1);
2858   } else {
2859     const __mmask8 mask = (1u << N) - 1;
2860     _mm_mask_i64scatter_epi64(base, mask, offset.raw, v.raw, 1);
2861   }
2862 }
2863 template <typename T, size_t N>
2864 HWY_INLINE void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
2865                              Simd<T, N> /* tag */, T* HWY_RESTRICT base,
2866                              const Vec128<int64_t, N> index) {
2867   if (N == 2) {
2868     _mm_i64scatter_epi64(base, index.raw, v.raw, 8);
2869   } else {
2870     const __mmask8 mask = (1u << N) - 1;
2871     _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, 8);
2872   }
2873 }
2874 
2875 }  // namespace detail
2876 
2877 template <typename T, size_t N, typename Offset>
2878 HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
2879                            const Vec128<Offset, N> offset) {
2880   static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2881   return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
2882 }
2883 template <typename T, size_t N, typename Index>
2884 HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
2885                           const Vec128<Index, N> index) {
2886   static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2887   return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
2888 }
2889 
2890 template <size_t N>
2891 HWY_API void ScatterOffset(Vec128<float, N> v, Simd<float, N> /* tag */,
2892                            float* HWY_RESTRICT base,
2893                            const Vec128<int32_t, N> offset) {
2894   if (N == 4) {
2895     _mm_i32scatter_ps(base, offset.raw, v.raw, 1);
2896   } else {
2897     const __mmask8 mask = (1u << N) - 1;
2898     _mm_mask_i32scatter_ps(base, mask, offset.raw, v.raw, 1);
2899   }
2900 }
2901 template <size_t N>
2902 HWY_API void ScatterIndex(Vec128<float, N> v, Simd<float, N> /* tag */,
2903                           float* HWY_RESTRICT base,
2904                           const Vec128<int32_t, N> index) {
2905   if (N == 4) {
2906     _mm_i32scatter_ps(base, index.raw, v.raw, 4);
2907   } else {
2908     const __mmask8 mask = (1u << N) - 1;
2909     _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, 4);
2910   }
2911 }
2912 
2913 template <size_t N>
2914 HWY_API void ScatterOffset(Vec128<double, N> v, Simd<double, N> /* tag */,
2915                            double* HWY_RESTRICT base,
2916                            const Vec128<int64_t, N> offset) {
2917   if (N == 2) {
2918     _mm_i64scatter_pd(base, offset.raw, v.raw, 1);
2919   } else {
2920     const __mmask8 mask = (1u << N) - 1;
2921     _mm_mask_i64scatter_pd(base, mask, offset.raw, v.raw, 1);
2922   }
2923 }
2924 template <size_t N>
2925 HWY_API void ScatterIndex(Vec128<double, N> v, Simd<double, N> /* tag */,
2926                           double* HWY_RESTRICT base,
2927                           const Vec128<int64_t, N> index) {
2928   if (N == 2) {
2929     _mm_i64scatter_pd(base, index.raw, v.raw, 8);
2930   } else {
2931     const __mmask8 mask = (1u << N) - 1;
2932     _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, 8);
2933   }
2934 }
2935 #else  // HWY_TARGET <= HWY_AVX3
2936 
2937 template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
2938 HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
2939                            const Vec128<Offset, N> offset) {
2940   static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2941 
2942   alignas(16) T lanes[N];
2943   Store(v, d, lanes);
2944 
2945   alignas(16) Offset offset_lanes[N];
2946   Store(offset, Simd<Offset, N>(), offset_lanes);
2947 
2948   uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
2949   for (size_t i = 0; i < N; ++i) {
2950     CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
2951   }
2952 }
2953 
2954 template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
2955 HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N> d, T* HWY_RESTRICT base,
2956                           const Vec128<Index, N> index) {
2957   static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2958 
2959   alignas(16) T lanes[N];
2960   Store(v, d, lanes);
2961 
2962   alignas(16) Index index_lanes[N];
2963   Store(index, Simd<Index, N>(), index_lanes);
2964 
2965   for (size_t i = 0; i < N; ++i) {
2966     base[index_lanes[i]] = lanes[i];
2967   }
2968 }
2969 
2970 #endif  // HWY_TARGET <= HWY_AVX3
2971 
2972 // ------------------------------ Gather (Load/Store)
2973 
2974 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2975 
2976 template <typename T, size_t N, typename Offset>
2977 HWY_API Vec128<T, N> GatherOffset(const Simd<T, N> d,
2978                                   const T* HWY_RESTRICT base,
2979                                   const Vec128<Offset, N> offset) {
2980   static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
2981 
2982   alignas(16) Offset offset_lanes[N];
2983   Store(offset, Simd<Offset, N>(), offset_lanes);
2984 
2985   alignas(16) T lanes[N];
2986   const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
2987   for (size_t i = 0; i < N; ++i) {
2988     CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
2989   }
2990   return Load(d, lanes);
2991 }
2992 
2993 template <typename T, size_t N, typename Index>
2994 HWY_API Vec128<T, N> GatherIndex(const Simd<T, N> d, const T* HWY_RESTRICT base,
2995                                  const Vec128<Index, N> index) {
2996   static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
2997 
2998   alignas(16) Index index_lanes[N];
2999   Store(index, Simd<Index, N>(), index_lanes);
3000 
3001   alignas(16) T lanes[N];
3002   for (size_t i = 0; i < N; ++i) {
3003     lanes[i] = base[index_lanes[i]];
3004   }
3005   return Load(d, lanes);
3006 }
3007 
3008 #else
3009 
3010 namespace detail {
3011 
3012 template <typename T, size_t N>
3013 HWY_INLINE Vec128<T, N> GatherOffset(hwy::SizeTag<4> /* tag */,
3014                                      Simd<T, N> /* d */,
3015                                      const T* HWY_RESTRICT base,
3016                                      const Vec128<int32_t, N> offset) {
3017   return Vec128<T, N>{_mm_i32gather_epi32(
3018       reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
3019 }
3020 template <typename T, size_t N>
3021 HWY_INLINE Vec128<T, N> GatherIndex(hwy::SizeTag<4> /* tag */,
3022                                     Simd<T, N> /* d */,
3023                                     const T* HWY_RESTRICT base,
3024                                     const Vec128<int32_t, N> index) {
3025   return Vec128<T, N>{_mm_i32gather_epi32(
3026       reinterpret_cast<const int32_t*>(base), index.raw, 4)};
3027 }
3028 
3029 template <typename T, size_t N>
3030 HWY_INLINE Vec128<T, N> GatherOffset(hwy::SizeTag<8> /* tag */,
3031                                      Simd<T, N> /* d */,
3032                                      const T* HWY_RESTRICT base,
3033                                      const Vec128<int64_t, N> offset) {
3034   return Vec128<T, N>{_mm_i64gather_epi64(
3035       reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
3036 }
3037 template <typename T, size_t N>
3038 HWY_INLINE Vec128<T, N> GatherIndex(hwy::SizeTag<8> /* tag */,
3039                                     Simd<T, N> /* d */,
3040                                     const T* HWY_RESTRICT base,
3041                                     const Vec128<int64_t, N> index) {
3042   return Vec128<T, N>{_mm_i64gather_epi64(
3043       reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)};
3044 }
3045 
3046 }  // namespace detail
3047 
3048 template <typename T, size_t N, typename Offset>
3049 HWY_API Vec128<T, N> GatherOffset(Simd<T, N> d, const T* HWY_RESTRICT base,
3050                                   const Vec128<Offset, N> offset) {
3051   return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
3052 }
3053 template <typename T, size_t N, typename Index>
3054 HWY_API Vec128<T, N> GatherIndex(Simd<T, N> d, const T* HWY_RESTRICT base,
3055                                  const Vec128<Index, N> index) {
3056   return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
3057 }
3058 
3059 template <size_t N>
3060 HWY_API Vec128<float, N> GatherOffset(Simd<float, N> /* tag */,
3061                                       const float* HWY_RESTRICT base,
3062                                       const Vec128<int32_t, N> offset) {
3063   return Vec128<float, N>{_mm_i32gather_ps(base, offset.raw, 1)};
3064 }
3065 template <size_t N>
3066 HWY_API Vec128<float, N> GatherIndex(Simd<float, N> /* tag */,
3067                                      const float* HWY_RESTRICT base,
3068                                      const Vec128<int32_t, N> index) {
3069   return Vec128<float, N>{_mm_i32gather_ps(base, index.raw, 4)};
3070 }
3071 
3072 template <size_t N>
3073 HWY_API Vec128<double, N> GatherOffset(Simd<double, N> /* tag */,
3074                                        const double* HWY_RESTRICT base,
3075                                        const Vec128<int64_t, N> offset) {
3076   return Vec128<double, N>{_mm_i64gather_pd(base, offset.raw, 1)};
3077 }
3078 template <size_t N>
3079 HWY_API Vec128<double, N> GatherIndex(Simd<double, N> /* tag */,
3080                                       const double* HWY_RESTRICT base,
3081                                       const Vec128<int64_t, N> index) {
3082   return Vec128<double, N>{_mm_i64gather_pd(base, index.raw, 8)};
3083 }
3084 
3085 #endif  // HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3086 
3087 HWY_DIAGNOSTICS(pop)
3088 
3089 // ================================================== SWIZZLE (2)
3090 
3091 // ------------------------------ LowerHalf
3092 
3093 // Returns upper/lower half of a vector.
3094 template <typename T, size_t N>
3095 HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2> /* tag */, Vec128<T, N> v) {
3096   return Vec128<T, N / 2>{v.raw};
3097 }
3098 
3099 template <typename T, size_t N>
3100 HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
3101   return LowerHalf(Simd<T, N / 2>(), v);
3102 }
3103 
3104 // ------------------------------ ShiftLeftBytes
3105 
3106 template <int kBytes, typename T, size_t N>
3107 HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N> /* tag */, Vec128<T, N> v) {
3108   static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
3109   return Vec128<T, N>{_mm_slli_si128(v.raw, kBytes)};
3110 }
3111 
3112 template <int kBytes, typename T, size_t N>
3113 HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
3114   return ShiftLeftBytes<kBytes>(Simd<T, N>(), v);
3115 }
3116 
3117 // ------------------------------ ShiftLeftLanes
3118 
3119 template <int kLanes, typename T, size_t N>
3120 HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N> d, const Vec128<T, N> v) {
3121   const Repartition<uint8_t, decltype(d)> d8;
3122   return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
3123 }
3124 
3125 template <int kLanes, typename T, size_t N>
3126 HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
3127   return ShiftLeftLanes<kLanes>(Simd<T, N>(), v);
3128 }
3129 
3130 // ------------------------------ ShiftRightBytes
3131 template <int kBytes, typename T, size_t N>
3132 HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N> /* tag */, Vec128<T, N> v) {
3133   static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
3134   // For partial vectors, clear upper lanes so we shift in zeros.
3135   if (N != 16 / sizeof(T)) {
3136     const Vec128<T> vfull{v.raw};
3137     v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
3138   }
3139   return Vec128<T, N>{_mm_srli_si128(v.raw, kBytes)};
3140 }
3141 
3142 // ------------------------------ ShiftRightLanes
3143 template <int kLanes, typename T, size_t N>
3144 HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N> d, const Vec128<T, N> v) {
3145   const Repartition<uint8_t, decltype(d)> d8;
3146   return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
3147 }
3148 
3149 // ------------------------------ UpperHalf (ShiftRightBytes)
3150 
3151 // Full input: copy hi into lo (smaller instruction encoding than shifts).
3152 template <typename T>
3153 HWY_API Vec128<T, 8 / sizeof(T)> UpperHalf(Half<Full128<T>> /* tag */,
3154                                            Vec128<T> v) {
3155   return Vec128<T, 8 / sizeof(T)>{_mm_unpackhi_epi64(v.raw, v.raw)};
3156 }
3157 HWY_API Vec128<float, 2> UpperHalf(Simd<float, 2> /* tag */, Vec128<float> v) {
3158   return Vec128<float, 2>{_mm_movehl_ps(v.raw, v.raw)};
3159 }
3160 HWY_API Vec128<double, 1> UpperHalf(Simd<double, 1> /* tag */,
3161                                     Vec128<double> v) {
3162   return Vec128<double, 1>{_mm_unpackhi_pd(v.raw, v.raw)};
3163 }
3164 
3165 // Partial
3166 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3167 HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N>> /* tag */,
3168                                          Vec128<T, N> v) {
3169   const Simd<T, N> d;
3170   const auto vu = BitCast(RebindToUnsigned<decltype(d)>(), v);
3171   const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(vu));
3172   return Vec128<T, (N + 1) / 2>{upper.raw};
3173 }
3174 
3175 // ------------------------------ CombineShiftRightBytes
3176 
3177 template <int kBytes, typename T, class V = Vec128<T>>
3178 HWY_API V CombineShiftRightBytes(Full128<T> d, V hi, V lo) {
3179   const Repartition<uint8_t, decltype(d)> d8;
3180   return BitCast(d, Vec128<uint8_t>{_mm_alignr_epi8(
3181                         BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
3182 }
3183 
3184 template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
3185           class V = Vec128<T, N>>
3186 HWY_API V CombineShiftRightBytes(Simd<T, N> d, V hi, V lo) {
3187   constexpr size_t kSize = N * sizeof(T);
3188   static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
3189   const Repartition<uint8_t, decltype(d)> d8;
3190   const Full128<uint8_t> d_full8;
3191   using V8 = VFromD<decltype(d_full8)>;
3192   const V8 hi8{BitCast(d8, hi).raw};
3193   // Move into most-significant bytes
3194   const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
3195   const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
3196   return V{BitCast(Full128<T>(), r).raw};
3197 }
3198 
3199 // ------------------------------ Broadcast/splat any lane
3200 
3201 // Unsigned
3202 template <int kLane, size_t N>
3203 HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
3204   static_assert(0 <= kLane && kLane < N, "Invalid lane");
3205   if (kLane < 4) {
3206     const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
3207     return Vec128<uint16_t, N>{_mm_unpacklo_epi64(lo, lo)};
3208   } else {
3209     const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
3210     return Vec128<uint16_t, N>{_mm_unpackhi_epi64(hi, hi)};
3211   }
3212 }
3213 template <int kLane, size_t N>
3214 HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
3215   static_assert(0 <= kLane && kLane < N, "Invalid lane");
3216   return Vec128<uint32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
3217 }
3218 template <int kLane, size_t N>
3219 HWY_API Vec128<uint64_t, N> Broadcast(const Vec128<uint64_t, N> v) {
3220   static_assert(0 <= kLane && kLane < N, "Invalid lane");
3221   return Vec128<uint64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
3222 }
3223 
3224 // Signed
3225 template <int kLane, size_t N>
3226 HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
3227   static_assert(0 <= kLane && kLane < N, "Invalid lane");
3228   if (kLane < 4) {
3229     const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
3230     return Vec128<int16_t, N>{_mm_unpacklo_epi64(lo, lo)};
3231   } else {
3232     const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
3233     return Vec128<int16_t, N>{_mm_unpackhi_epi64(hi, hi)};
3234   }
3235 }
3236 template <int kLane, size_t N>
3237 HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
3238   static_assert(0 <= kLane && kLane < N, "Invalid lane");
3239   return Vec128<int32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
3240 }
3241 template <int kLane, size_t N>
3242 HWY_API Vec128<int64_t, N> Broadcast(const Vec128<int64_t, N> v) {
3243   static_assert(0 <= kLane && kLane < N, "Invalid lane");
3244   return Vec128<int64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
3245 }
3246 
3247 // Float
3248 template <int kLane, size_t N>
3249 HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
3250   static_assert(0 <= kLane && kLane < N, "Invalid lane");
3251   return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
3252 }
3253 template <int kLane, size_t N>
3254 HWY_API Vec128<double, N> Broadcast(const Vec128<double, N> v) {
3255   static_assert(0 <= kLane && kLane < N, "Invalid lane");
3256   return Vec128<double, N>{_mm_shuffle_pd(v.raw, v.raw, 3 * kLane)};
3257 }
3258 
3259 // ------------------------------ TableLookupBytes
3260 template <typename T, size_t N, typename TI, size_t NI>
3261 HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
3262                                         const Vec128<TI, NI> from) {
3263   return Vec128<TI, NI>{_mm_shuffle_epi8(bytes.raw, from.raw)};
3264 }
3265 
3266 // ------------------------------ TableLookupBytesOr0
3267 // For all vector widths; x86 already zeroes bytes whose index is >= 0x80.
3268 template <class V, class VI>
3269 HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
3270   return TableLookupBytes(bytes, from);
3271 }
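// For example, an index byte of 0x80 (or any value with the MSB set) yields 0
// in the corresponding output byte, which is exactly the PSHUFB behavior.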
3272 
3273 // ------------------------------ TableLookupLanes (Shuffle01)
3274 
3275 // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
3276 template <typename T, size_t N = 16 / sizeof(T)>
3277 struct Indices128 {
3278   __m128i raw;
3279 };
3280 
3281 template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N),
3282           HWY_IF_LANE_SIZE(T, 4)>
3283 HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N> d, Vec128<TI, N> vec) {
3284   static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
3285 #if HWY_IS_DEBUG_BUILD
3286   const Simd<TI, N> di;
3287   HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
3288               AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
3289 #endif
3290 
3291 #if HWY_TARGET <= HWY_AVX2
3292   (void)d;
3293   return Indices128<T, N>{vec.raw};
3294 #else
3295   const Repartition<uint8_t, decltype(d)> d8;
3296   using V8 = VFromD<decltype(d8)>;
3297   alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
3298                                                     0, 1, 2, 3, 0, 1, 2, 3};
3299 
3300   // Broadcast each lane index to all 4 bytes of T
3301   alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
3302       0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
3303   const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes));
3304 
3305   // Shift to bytes
3306   const Repartition<uint16_t, decltype(d)> d16;
3307   const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
3308 
3309   return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
3310 #endif
3311 }
3312 
3313 template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N),
3314           HWY_IF_LANE_SIZE(T, 8)>
3315 HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N> /* tag */,
3316                                         Vec128<TI, N> vec) {
3317   static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
3318 #if HWY_IS_DEBUG_BUILD
3319   const Simd<TI, N> di;
3320   HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
3321               AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
3322 #endif
3323 
3324   // No change - even without AVX3, we can shuffle+blend.
3325   return Indices128<T, N>{vec.raw};
3326 }
3327 
3328 template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
3329 HWY_API Indices128<T, N> SetTableIndices(Simd<T, N> d, const TI* idx) {
3330   const Rebind<TI, decltype(d)> di;
3331   return IndicesFromVec(d, LoadU(di, idx));
3332 }
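// Illustrative usage (a sketch; the variable names are not part of the API):
//   const Full128<int32_t> d;
//   static constexpr int32_t kIdx[4] = {3, 2, 1, 0};
//   const auto reversed = TableLookupLanes(v, SetTableIndices(d, kIdx));
// reverses the four lanes of v.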
3333 
3334 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3335 HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
3336 #if HWY_TARGET <= HWY_AVX2
3337   const Simd<T, N> d;
3338   const Simd<float, N> df;
3339   const Vec128<float, N> perm{_mm_permutevar_ps(BitCast(df, v).raw, idx.raw)};
3340   return BitCast(d, perm);
3341 #else
3342   return TableLookupBytes(v, Vec128<T, N>{idx.raw});
3343 #endif
3344 }
3345 
3346 template <size_t N, HWY_IF_GE64(float, N)>
3347 HWY_API Vec128<float, N> TableLookupLanes(Vec128<float, N> v,
3348                                           Indices128<float, N> idx) {
3349 #if HWY_TARGET <= HWY_AVX2
3350   return Vec128<float, N>{_mm_permutevar_ps(v.raw, idx.raw)};
3351 #else
3352   const Simd<int32_t, N> di;
3353   const Simd<float, N> df;
3354   return BitCast(df,
3355                  TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
3356 #endif
3357 }
3358 
3359 // Single lane: no change
3360 template <typename T>
3361 HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
3362                                       Indices128<T, 1> /* idx */) {
3363   return v;
3364 }
3365 
3366 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3367 HWY_API Vec128<T> TableLookupLanes(Vec128<T> v, Indices128<T> idx) {
3368   const Full128<T> d;
3369   Vec128<int64_t> vidx{idx.raw};
3370 #if HWY_TARGET <= HWY_AVX2
3371   // There is no _mm_permute[x]var_epi64.
3372   vidx += vidx;  // bit1 is the decider (unusual)
3373   const Full128<double> df;
3374   return BitCast(
3375       d, Vec128<double>{_mm_permutevar_pd(BitCast(df, v).raw, vidx.raw)});
3376 #else
3377   // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
3378   // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
3379   // to obtain an all-zero or all-one mask.
3380   const Full128<int64_t> di;
3381   const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
3382   const Mask128<T> mask_same = RebindMask(d, MaskFromVec(same));
3383   return IfThenElse(mask_same, v, Shuffle01(v));
3384 #endif
3385 }
3386 
3387 HWY_API Vec128<double> TableLookupLanes(Vec128<double> v,
3388                                         Indices128<double> idx) {
3389   Vec128<int64_t> vidx{idx.raw};
3390 #if HWY_TARGET <= HWY_AVX2
3391   vidx += vidx;  // bit1 is the decider (unusual)
3392   return Vec128<double>{_mm_permutevar_pd(v.raw, vidx.raw)};
3393 #else
3394   // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
3395   // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
3396   // to obtain an all-zero or all-one mask.
3397   const Full128<double> d;
3398   const Full128<int64_t> di;
3399   const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
3400   const Mask128<double> mask_same = RebindMask(d, MaskFromVec(same));
3401   return IfThenElse(mask_same, v, Shuffle01(v));
3402 #endif
3403 }
3404 
3405 // ------------------------------ Reverse (Shuffle0123, Shuffle2301)
3406 
3407 // Single lane: no change
3408 template <typename T>
3409 HWY_API Vec128<T, 1> Reverse(Simd<T, 1> /* tag */, const Vec128<T, 1> v) {
3410   return v;
3411 }
3412 
3413 // Two lanes: shuffle
3414 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3415 HWY_API Vec128<T, 2> Reverse(Simd<T, 2> /* tag */, const Vec128<T, 2> v) {
3416   return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
3417 }
3418 
3419 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3420 HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
3421   return Shuffle01(v);
3422 }
3423 
3424 // Four lanes: shuffle
3425 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3426 HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
3427   return Shuffle0123(v);
3428 }
3429 
3430 // 16-bit
3431 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3432 HWY_API Vec128<T, N> Reverse(Simd<T, N> d, const Vec128<T, N> v) {
3433 #if HWY_TARGET <= HWY_AVX3
3434   if (N == 1) return v;
3435   if (N == 2) {
3436     const Repartition<uint32_t, decltype(d)> du32;
3437     return BitCast(d, RotateRight<16>(BitCast(du32, v)));
3438   }
3439   const RebindToSigned<decltype(d)> di;
3440   alignas(16) constexpr int16_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
3441   const Vec128<int16_t, N> idx = Load(di, kReverse + (N == 8 ? 0 : 4));
3442   return BitCast(d, Vec128<int16_t, N>{
3443                         _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
3444 #else
3445   const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
3446   return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
3447 #endif
3448 }
3449 
3450 // ------------------------------ InterleaveLower
3451 
3452 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
3453 // the least-significant lane) and "b". To concatenate two half-width integers
3454 // into one, use ZipLower/Upper instead (also works with scalar).
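// For example, with uint32_t lanes (least-significant lane first),
// InterleaveLower(a, b) on a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3}
// returns {a0, b0, a1, b1}.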
3455 
3456 template <size_t N, HWY_IF_LE128(uint8_t, N)>
3457 HWY_API Vec128<uint8_t, N> InterleaveLower(const Vec128<uint8_t, N> a,
3458                                            const Vec128<uint8_t, N> b) {
3459   return Vec128<uint8_t, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
3460 }
3461 template <size_t N, HWY_IF_LE128(uint16_t, N)>
3462 HWY_API Vec128<uint16_t, N> InterleaveLower(const Vec128<uint16_t, N> a,
3463                                             const Vec128<uint16_t, N> b) {
3464   return Vec128<uint16_t, N>{_mm_unpacklo_epi16(a.raw, b.raw)};
3465 }
3466 template <size_t N, HWY_IF_LE128(uint32_t, N)>
3467 HWY_API Vec128<uint32_t, N> InterleaveLower(const Vec128<uint32_t, N> a,
3468                                             const Vec128<uint32_t, N> b) {
3469   return Vec128<uint32_t, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
3470 }
3471 template <size_t N, HWY_IF_LE128(uint64_t, N)>
3472 HWY_API Vec128<uint64_t, N> InterleaveLower(const Vec128<uint64_t, N> a,
3473                                             const Vec128<uint64_t, N> b) {
3474   return Vec128<uint64_t, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
3475 }
3476 
3477 template <size_t N, HWY_IF_LE128(int8_t, N)>
3478 HWY_API Vec128<int8_t, N> InterleaveLower(const Vec128<int8_t, N> a,
3479                                           const Vec128<int8_t, N> b) {
3480   return Vec128<int8_t, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
3481 }
3482 template <size_t N, HWY_IF_LE128(int16_t, N)>
3483 HWY_API Vec128<int16_t, N> InterleaveLower(const Vec128<int16_t, N> a,
3484                                            const Vec128<int16_t, N> b) {
3485   return Vec128<int16_t, N>{_mm_unpacklo_epi16(a.raw, b.raw)};
3486 }
3487 template <size_t N, HWY_IF_LE128(int32_t, N)>
3488 HWY_API Vec128<int32_t, N> InterleaveLower(const Vec128<int32_t, N> a,
3489                                            const Vec128<int32_t, N> b) {
3490   return Vec128<int32_t, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
3491 }
3492 template <size_t N, HWY_IF_LE128(int64_t, N)>
3493 HWY_API Vec128<int64_t, N> InterleaveLower(const Vec128<int64_t, N> a,
3494                                            const Vec128<int64_t, N> b) {
3495   return Vec128<int64_t, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
3496 }
3497 
3498 template <size_t N, HWY_IF_LE128(float, N)>
3499 HWY_API Vec128<float, N> InterleaveLower(const Vec128<float, N> a,
3500                                          const Vec128<float, N> b) {
3501   return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)};
3502 }
3503 template <size_t N, HWY_IF_LE128(double, N)>
3504 HWY_API Vec128<double, N> InterleaveLower(const Vec128<double, N> a,
3505                                           const Vec128<double, N> b) {
3506   return Vec128<double, N>{_mm_unpacklo_pd(a.raw, b.raw)};
3507 }
3508 
3509 // Additional overload for the optional Simd<> tag.
3510 template <typename T, size_t N, HWY_IF_LE128(T, N), class V = Vec128<T, N>>
3511 HWY_API V InterleaveLower(Simd<T, N> /* tag */, V a, V b) {
3512   return InterleaveLower(a, b);
3513 }
3514 
3515 // ------------------------------ InterleaveUpper (UpperHalf)
3516 
3517 // All functions inside detail lack the required D parameter.
3518 namespace detail {
3519 
3520 HWY_API Vec128<uint8_t> InterleaveUpper(const Vec128<uint8_t> a,
3521                                         const Vec128<uint8_t> b) {
3522   return Vec128<uint8_t>{_mm_unpackhi_epi8(a.raw, b.raw)};
3523 }
3524 HWY_API Vec128<uint16_t> InterleaveUpper(const Vec128<uint16_t> a,
3525                                          const Vec128<uint16_t> b) {
3526   return Vec128<uint16_t>{_mm_unpackhi_epi16(a.raw, b.raw)};
3527 }
3528 HWY_API Vec128<uint32_t> InterleaveUpper(const Vec128<uint32_t> a,
3529                                          const Vec128<uint32_t> b) {
3530   return Vec128<uint32_t>{_mm_unpackhi_epi32(a.raw, b.raw)};
3531 }
3532 HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
3533                                          const Vec128<uint64_t> b) {
3534   return Vec128<uint64_t>{_mm_unpackhi_epi64(a.raw, b.raw)};
3535 }
3536 
3537 HWY_API Vec128<int8_t> InterleaveUpper(const Vec128<int8_t> a,
3538                                        const Vec128<int8_t> b) {
3539   return Vec128<int8_t>{_mm_unpackhi_epi8(a.raw, b.raw)};
3540 }
3541 HWY_API Vec128<int16_t> InterleaveUpper(const Vec128<int16_t> a,
3542                                         const Vec128<int16_t> b) {
3543   return Vec128<int16_t>{_mm_unpackhi_epi16(a.raw, b.raw)};
3544 }
3545 HWY_API Vec128<int32_t> InterleaveUpper(const Vec128<int32_t> a,
3546                                         const Vec128<int32_t> b) {
3547   return Vec128<int32_t>{_mm_unpackhi_epi32(a.raw, b.raw)};
3548 }
3549 HWY_API Vec128<int64_t> InterleaveUpper(const Vec128<int64_t> a,
3550                                         const Vec128<int64_t> b) {
3551   return Vec128<int64_t>{_mm_unpackhi_epi64(a.raw, b.raw)};
3552 }
3553 
3554 HWY_API Vec128<float> InterleaveUpper(const Vec128<float> a,
3555                                       const Vec128<float> b) {
3556   return Vec128<float>{_mm_unpackhi_ps(a.raw, b.raw)};
3557 }
3558 HWY_API Vec128<double> InterleaveUpper(const Vec128<double> a,
3559                                        const Vec128<double> b) {
3560   return Vec128<double>{_mm_unpackhi_pd(a.raw, b.raw)};
3561 }
3562 
3563 }  // namespace detail
3564 
3565 // Full
3566 template <typename T, class V = Vec128<T>>
3567 HWY_API V InterleaveUpper(Full128<T> /* tag */, V a, V b) {
3568   return detail::InterleaveUpper(a, b);
3569 }
3570 
3571 // Partial
3572 template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
3573 HWY_API V InterleaveUpper(Simd<T, N> d, V a, V b) {
3574   const Half<decltype(d)> d2;
3575   return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw});
3576 }
3577 
3578 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
3579 
3580 // Same as Interleave*, except that the return lanes are double-width integers;
3581 // this is necessary because the single-lane scalar cannot return two values.
3582 template <typename T, size_t N, class DW = RepartitionToWide<Simd<T, N>>>
3583 HWY_API VFromD<DW> ZipLower(Vec128<T, N> a, Vec128<T, N> b) {
3584   return BitCast(DW(), InterleaveLower(a, b));
3585 }
3586 template <typename T, size_t N, class D = Simd<T, N>,
3587           class DW = RepartitionToWide<D>>
3588 HWY_API VFromD<DW> ZipLower(DW dw, Vec128<T, N> a, Vec128<T, N> b) {
3589   return BitCast(dw, InterleaveLower(D(), a, b));
3590 }
3591 
3592 template <typename T, size_t N, class D = Simd<T, N>,
3593           class DW = RepartitionToWide<D>>
3594 HWY_API VFromD<DW> ZipUpper(DW dw, Vec128<T, N> a, Vec128<T, N> b) {
3595   return BitCast(dw, InterleaveUpper(D(), a, b));
3596 }
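// For example, with uint16_t inputs a = {a0, .., a7} and b = {b0, .., b7},
// ZipLower returns uint32_t lanes whose lower halves are a0..a3 and upper
// halves are b0..b3, i.e. lane 0 equals a0 | (uint32_t{b0} << 16).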
3597 
3598 // ================================================== COMBINE
3599 
3600 // ------------------------------ Combine (InterleaveLower)
3601 
3602 // N = N/2 + N/2 (upper half undefined)
3603 template <typename T, size_t N, HWY_IF_LE128(T, N)>
3604 HWY_API Vec128<T, N> Combine(Simd<T, N> d, Vec128<T, N / 2> hi_half,
3605                              Vec128<T, N / 2> lo_half) {
3606   const Half<decltype(d)> d2;
3607   const RebindToUnsigned<decltype(d2)> du2;
3608   // Treat half-width input as one lane, and expand to two lanes.
3609   using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
3610   const VU lo{BitCast(du2, lo_half).raw};
3611   const VU hi{BitCast(du2, hi_half).raw};
3612   return BitCast(d, InterleaveLower(lo, hi));
3613 }
3614 
3615 // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
3616 
3617 template <typename T, HWY_IF_NOT_FLOAT(T)>
3618 HWY_API Vec128<T> ZeroExtendVector(Full128<T> /* tag */,
3619                                    Vec128<T, 8 / sizeof(T)> lo) {
3620   return Vec128<T>{_mm_move_epi64(lo.raw)};
3621 }
3622 
3623 template <typename T, HWY_IF_FLOAT(T)>
3624 HWY_API Vec128<T> ZeroExtendVector(Full128<T> d, Vec128<T, 8 / sizeof(T)> lo) {
3625   const RebindToUnsigned<decltype(d)> du;
3626   return BitCast(d, ZeroExtendVector(du, BitCast(Half<decltype(du)>(), lo)));
3627 }
3628 
3629 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3630 HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N> d, Vec128<T, N / 2> lo) {
3631   return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
3632 }
3633 
3634 // ------------------------------ Concat full (InterleaveLower)
3635 
3636 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
3637 template <typename T>
3638 HWY_API Vec128<T> ConcatLowerLower(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3639   const Repartition<uint64_t, decltype(d)> d64;
3640   return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
3641 }
3642 
3643 // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
3644 template <typename T>
3645 HWY_API Vec128<T> ConcatUpperUpper(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3646   const Repartition<uint64_t, decltype(d)> d64;
3647   return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi)));
3648 }
3649 
3650 // hiH,hiL loH,loL |-> hiL,loH (= inner halves)
3651 template <typename T>
3652 HWY_API Vec128<T> ConcatLowerUpper(Full128<T> d, const Vec128<T> hi,
3653                                    const Vec128<T> lo) {
3654   return CombineShiftRightBytes<8>(d, hi, lo);
3655 }
3656 
3657 // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
3658 template <typename T>
3659 HWY_API Vec128<T> ConcatUpperLower(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3660 #if HWY_TARGET == HWY_SSSE3
3661   const Full128<double> dd;
3662   const __m128d concat = _mm_move_sd(BitCast(dd, hi).raw, BitCast(dd, lo).raw);
3663   return BitCast(d, Vec128<double>{concat});
3664 #else
3665   (void)d;
3666   return Vec128<T>{_mm_blend_epi16(hi.raw, lo.raw, 0x0F)};
3667 #endif
3668 }
3669 HWY_API Vec128<float> ConcatUpperLower(Full128<float> /* tag */,
3670                                        const Vec128<float> hi,
3671                                        const Vec128<float> lo) {
3672   return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 2, 1, 0))};
3673 }
3674 HWY_API Vec128<double> ConcatUpperLower(Full128<double> /* tag */,
3675                                         const Vec128<double> hi,
3676                                         const Vec128<double> lo) {
3677   return Vec128<double>{_mm_shuffle_pd(lo.raw, hi.raw, _MM_SHUFFLE2(1, 0))};
3678 }
3679 
3680 // ------------------------------ Concat partial (Combine, LowerHalf)
3681 
3682 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3683 HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N> d, Vec128<T, N> hi,
3684                                       Vec128<T, N> lo) {
3685   const Half<decltype(d)> d2;
3686   return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
3687 }
3688 
3689 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3690 HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N> d, Vec128<T, N> hi,
3691                                       Vec128<T, N> lo) {
3692   const Half<decltype(d)> d2;
3693   return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
3694 }
3695 
3696 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3697 HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N> d, const Vec128<T, N> hi,
3698                                       const Vec128<T, N> lo) {
3699   const Half<decltype(d)> d2;
3700   return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
3701 }
3702 
3703 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3704 HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N> d, Vec128<T, N> hi,
3705                                       Vec128<T, N> lo) {
3706   const Half<decltype(d)> d2;
3707   return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
3708 }
3709 
3710 // ------------------------------ ConcatOdd
3711 
3712 // 32-bit full
3713 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3714 HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3715   const RebindToFloat<decltype(d)> df;
3716   return BitCast(
3717       d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
3718                                       _MM_SHUFFLE(3, 1, 3, 1))});
3719 }
3721 HWY_API Vec128<float> ConcatOdd(Full128<float> /* tag */, Vec128<float> hi,
3722                                 Vec128<float> lo) {
3723   return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
3724 }
3725 
3726 // 32-bit partial
3727 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3728 HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2> d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
3729   return InterleaveUpper(d, lo, hi);
3730 }
3731 
3732 // 64-bit full - no partial version because a partial 64-bit vector has only
3733 // one lane, so there is no even/odd pair.
3734 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3735 HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3736   return InterleaveUpper(d, lo, hi);
3737 }
3738 
3739 // ------------------------------ ConcatEven (InterleaveLower)
3740 
3741 // 32-bit full
3742 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3743 HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3744   const RebindToFloat<decltype(d)> df;
3745   return BitCast(
3746       d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
3747                                       _MM_SHUFFLE(2, 0, 2, 0))});
3748 }
3750 HWY_API Vec128<float> ConcatEven(Full128<float> /* tag */, Vec128<float> hi,
3751                                  Vec128<float> lo) {
3752   return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
3753 }
3754 
3755 // 32-bit partial
3756 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3757 HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2> d, Vec128<T, 2> hi,
3758                                 Vec128<T, 2> lo) {
3759   return InterleaveLower(d, lo, hi);
3760 }
3761 
3762 // 64-bit full - no partial version because a partial 64-bit vector has only
3763 // one lane, so there is no even/odd pair.
3764 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3765 HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3766   return InterleaveLower(d, lo, hi);
3767 }
3768 
3769 // ------------------------------ OddEven (IfThenElse)
3770 
3771 namespace detail {
3772 
3773 template <typename T, size_t N>
3774 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a,
3775                                 const Vec128<T, N> b) {
3776   const Simd<T, N> d;
3777   const Repartition<uint8_t, decltype(d)> d8;
3778   alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
3779                                             0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
3780   return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
3781 }
3782 template <typename T, size_t N>
3783 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a,
3784                                 const Vec128<T, N> b) {
3785 #if HWY_TARGET == HWY_SSSE3
3786   const Simd<T, N> d;
3787   const Repartition<uint8_t, decltype(d)> d8;
3788   alignas(16) constexpr uint8_t mask[16] = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
3789                                             0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
3790   return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
3791 #else
3792   return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
3793 #endif
3794 }
3795 template <typename T, size_t N>
3796 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a,
3797                                 const Vec128<T, N> b) {
3798 #if HWY_TARGET == HWY_SSSE3
3799   const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1));
3800   const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0));
3801   return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)};
3802 #else
3803   return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x33)};
3804 #endif
3805 }
3806 template <typename T, size_t N>
3807 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a,
3808                                 const Vec128<T, N> b) {
3809 #if HWY_TARGET == HWY_SSSE3
3810   const Full128<double> dd;
3811   const __m128d concat = _mm_move_sd(BitCast(dd, a).raw, BitCast(dd, b).raw);
3812   return BitCast(Full128<T>(), Vec128<double>{concat});
3813 #else
3814   return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x0F)};
3815 #endif
3816 }
3817 
3818 }  // namespace detail
3819 
3820 template <typename T, size_t N>
3821 HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
3822   return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
3823 }
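// In other words, OddEven(a, b) = {b[0], a[1], b[2], a[3], ...}: even-indexed
// lanes are taken from b and odd-indexed lanes from a.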
3824 template <size_t N>
3825 HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a,
3826                                  const Vec128<float, N> b) {
3827 #if HWY_TARGET == HWY_SSSE3
3828   // SHUFPS must fill the lower half of the output from one register, so we
3829   // need another shuffle. Unpack avoids another immediate byte.
3830   const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1));
3831   const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0));
3832   return Vec128<float, N>{_mm_unpacklo_ps(even, odd)};
3833 #else
3834   return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
3835 #endif
3836 }
3837 
3838 template <size_t N>
3839 HWY_API Vec128<double, N> OddEven(const Vec128<double, N> a,
3840                                   const Vec128<double, N> b) {
3841   return Vec128<double, N>{_mm_shuffle_pd(b.raw, a.raw, _MM_SHUFFLE2(1, 0))};
3842 }
3843 
3844 // ------------------------------ OddEvenBlocks
3845 template <typename T, size_t N>
3846 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
3847   return even;
3848 }
3849 
3850 // ------------------------------ SwapAdjacentBlocks
3851 
3852 template <typename T, size_t N>
3853 HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
3854   return v;
3855 }
3856 
3857 // ------------------------------ Shl (ZipLower, Mul)
3858 
3859 // Use AVX2/3 variable shifts where available, otherwise multiply by powers of
3860 // two obtained by constructing float exponents, which is considerably faster
3861 // (per LLVM-MCA) than scalar code or bit tests: https://gcc.godbolt.org/z/9G7Y9v.
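// For example, for a 32-bit lane holding the shift count 3, Pow2 places
// (127 + 3) in the exponent field to form the float 8.0f, converts back to
// integer 8, and the subsequent v * 8 equals v << 3.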
3862 
3863 #if HWY_TARGET > HWY_AVX3  // AVX2 or older
3864 namespace detail {
3865 
3866 // Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
3867 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3868 HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
3869   const Simd<T, N> d;
3870   const RepartitionToWide<decltype(d)> dw;
3871   const Rebind<float, decltype(dw)> df;
3872   const auto zero = Zero(d);
3873   // Move into exponent (this u16 will become the upper half of an f32)
3874   const auto exp = ShiftLeft<23 - 16>(v);
3875   const auto upper = exp + Set(d, 0x3F80);  // upper half of 1.0f
3876   // Insert 0 into lower halves for reinterpreting as binary32.
3877   const auto f0 = ZipLower(dw, zero, upper);
3878   const auto f1 = ZipUpper(dw, zero, upper);
3879   // See comment below.
3880   const Vec128<int32_t, N> bits0{_mm_cvtps_epi32(BitCast(df, f0).raw)};
3881   const Vec128<int32_t, N> bits1{_mm_cvtps_epi32(BitCast(df, f1).raw)};
3882   return Vec128<MakeUnsigned<T>, N>{_mm_packus_epi32(bits0.raw, bits1.raw)};
3883 }
3884 
3885 // Same, for 32-bit shifts.
3886 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3887 HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
3888   const Simd<T, N> d;
3889   const auto exp = ShiftLeft<23>(v);
3890   const auto f = exp + Set(d, 0x3F800000);  // 1.0f
3891   // Do not use ConvertTo because we rely on the native 0x80..00 overflow
3892   // behavior. cvt instead of cvtt should be equivalent, but avoids test
3893   // failure under GCC 10.2.1.
3894   return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
3895 }
3896 
3897 }  // namespace detail
3898 #endif  // HWY_TARGET > HWY_AVX3
3899 
3900 template <size_t N>
3901 HWY_API Vec128<uint16_t, N> operator<<(const Vec128<uint16_t, N> v,
3902                                        const Vec128<uint16_t, N> bits) {
3903 #if HWY_TARGET <= HWY_AVX3
3904   return Vec128<uint16_t, N>{_mm_sllv_epi16(v.raw, bits.raw)};
3905 #else
3906   return v * detail::Pow2(bits);
3907 #endif
3908 }
3909 HWY_API Vec128<uint16_t, 1> operator<<(const Vec128<uint16_t, 1> v,
3910                                        const Vec128<uint16_t, 1> bits) {
3911   return Vec128<uint16_t, 1>{_mm_sll_epi16(v.raw, bits.raw)};
3912 }
3913 
3914 template <size_t N>
3915 HWY_API Vec128<uint32_t, N> operator<<(const Vec128<uint32_t, N> v,
3916                                        const Vec128<uint32_t, N> bits) {
3917 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3918   return v * detail::Pow2(bits);
3919 #else
3920   return Vec128<uint32_t, N>{_mm_sllv_epi32(v.raw, bits.raw)};
3921 #endif
3922 }
3923 HWY_API Vec128<uint32_t, 1> operator<<(const Vec128<uint32_t, 1> v,
3924                                        const Vec128<uint32_t, 1> bits) {
3925   return Vec128<uint32_t, 1>{_mm_sll_epi32(v.raw, bits.raw)};
3926 }
3927 
3928 HWY_API Vec128<uint64_t> operator<<(const Vec128<uint64_t> v,
3929                                     const Vec128<uint64_t> bits) {
3930 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3931   // Individual shifts and combine
3932   const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)};
3933   const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
3934   const Vec128<uint64_t> out1{_mm_sll_epi64(v.raw, bits1)};
3935   return ConcatUpperLower(Full128<uint64_t>(), out1, out0);
3936 #else
3937   return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)};
3938 #endif
3939 }
3940 HWY_API Vec128<uint64_t, 1> operator<<(const Vec128<uint64_t, 1> v,
3941                                        const Vec128<uint64_t, 1> bits) {
3942   return Vec128<uint64_t, 1>{_mm_sll_epi64(v.raw, bits.raw)};
3943 }
3944 
3945 // Signed left shift is the same as unsigned.
3946 template <typename T, size_t N, HWY_IF_SIGNED(T)>
3947 HWY_API Vec128<T, N> operator<<(const Vec128<T, N> v, const Vec128<T, N> bits) {
3948   const Simd<T, N> di;
3949   const Simd<MakeUnsigned<T>, N> du;
3950   return BitCast(di, BitCast(du, v) << BitCast(du, bits));
3951 }
3952 
3953 // ------------------------------ Shr (mul, mask, BroadcastSignBit)
3954 
3955 // Use AVX2+ variable shifts except for SSSE3/SSE4 or 16-bit. There, we use
3956 // widening multiplication by powers of two obtained by loading float exponents,
3957 // followed by a constant right-shift. This is still faster than a scalar or
3958 // bit-test approach: https://gcc.godbolt.org/z/9G7Y9v.
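// For example, a 16-bit shift right by 3 multiplies by 2^(16 - 3) and keeps
// the upper 16 bits of the 32-bit product via MulHigh, which equals in >> 3;
// bits == 0 is patched up afterwards because 2^16 does not fit in 16 bits.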
3959 
3960 template <size_t N>
3961 HWY_API Vec128<uint16_t, N> operator>>(const Vec128<uint16_t, N> in,
3962                                        const Vec128<uint16_t, N> bits) {
3963 #if HWY_TARGET <= HWY_AVX3
3964   return Vec128<uint16_t, N>{_mm_srlv_epi16(in.raw, bits.raw)};
3965 #else
3966   const Simd<uint16_t, N> d;
3967   // For bits=0, we cannot mul by 2^16, so fix the result later.
3968   const auto out = MulHigh(in, detail::Pow2(Set(d, 16) - bits));
3969   // Replace output with input where bits == 0.
3970   return IfThenElse(bits == Zero(d), in, out);
3971 #endif
3972 }
3973 HWY_API Vec128<uint16_t, 1> operator>>(const Vec128<uint16_t, 1> in,
3974                                        const Vec128<uint16_t, 1> bits) {
3975   return Vec128<uint16_t, 1>{_mm_srl_epi16(in.raw, bits.raw)};
3976 }
3977 
3978 template <size_t N>
3979 HWY_API Vec128<uint32_t, N> operator>>(const Vec128<uint32_t, N> in,
3980                                        const Vec128<uint32_t, N> bits) {
3981 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3982   // 32x32 -> 64 bit mul, then shift right by 32.
3983   const Simd<uint32_t, N> d32;
3984   // Move odd lanes into position for the second mul. Shuffle more gracefully
3985   // handles N=1 than repartitioning to u64 and shifting 32 bits right.
3986   const Vec128<uint32_t, N> in31{_mm_shuffle_epi32(in.raw, 0x31)};
3987   // For bits=0, we cannot mul by 2^32, so fix the result later.
3988   const auto mul = detail::Pow2(Set(d32, 32) - bits);
3989   const auto out20 = ShiftRight<32>(MulEven(in, mul));  // z 2 z 0
3990   const Vec128<uint32_t, N> mul31{_mm_shuffle_epi32(mul.raw, 0x31)};
3991   // No need to shift right, already in the correct position.
3992   const auto out31 = BitCast(d32, MulEven(in31, mul31));  // 3 ? 1 ?
3993   const Vec128<uint32_t, N> out = OddEven(out31, BitCast(d32, out20));
3994   // Replace output with input where bits == 0.
3995   return IfThenElse(bits == Zero(d32), in, out);
3996 #else
3997   return Vec128<uint32_t, N>{_mm_srlv_epi32(in.raw, bits.raw)};
3998 #endif
3999 }
4000 HWY_API Vec128<uint32_t, 1> operator>>(const Vec128<uint32_t, 1> in,
4001                                        const Vec128<uint32_t, 1> bits) {
4002   return Vec128<uint32_t, 1>{_mm_srl_epi32(in.raw, bits.raw)};
4003 }
4004 
4005 HWY_API Vec128<uint64_t> operator>>(const Vec128<uint64_t> v,
4006                                     const Vec128<uint64_t> bits) {
4007 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
4008   // Individual shifts and combine
4009   const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)};
4010   const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
4011   const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)};
4012   return ConcatUpperLower(Full128<uint64_t>(), out1, out0);
4013 #else
4014   return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)};
4015 #endif
4016 }
4017 HWY_API Vec128<uint64_t, 1> operator>>(const Vec128<uint64_t, 1> v,
4018                                        const Vec128<uint64_t, 1> bits) {
4019   return Vec128<uint64_t, 1>{_mm_srl_epi64(v.raw, bits.raw)};
4020 }
4021 
4022 #if HWY_TARGET > HWY_AVX3  // AVX2 or older
4023 namespace detail {
4024 
4025 // Also used in x86_256-inl.h.
4026 template <class DI, class V>
4027 HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) {
4028   const RebindToUnsigned<DI> du;
4029   const auto count = BitCast(du, count_i);  // same type as value to shift
4030   // Clear sign and restore afterwards. This is preferable to shifting the MSB
4031   // downwards because Shr is somewhat more expensive than Shl.
4032   const auto sign = BroadcastSignBit(v);
4033   const auto abs = BitCast(du, v ^ sign);  // off by one, but fixed below
4034   return BitCast(di, abs >> count) ^ sign;
4035 }
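// Worked example for int16_t: v = -5 (0xFFFB), count = 1. sign = 0xFFFF,
// v ^ sign = 4 (i.e. ~v), the logical shift right gives 2, and the final XOR
// with sign yields 0xFFFD = -3, matching the arithmetic shift (-5) >> 1.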
4036 
4037 }  // namespace detail
4038 #endif  // HWY_TARGET > HWY_AVX3
4039 
4040 template <size_t N>
4041 HWY_API Vec128<int16_t, N> operator>>(const Vec128<int16_t, N> v,
4042                                       const Vec128<int16_t, N> bits) {
4043 #if HWY_TARGET <= HWY_AVX3
4044   return Vec128<int16_t, N>{_mm_srav_epi16(v.raw, bits.raw)};
4045 #else
4046   return detail::SignedShr(Simd<int16_t, N>(), v, bits);
4047 #endif
4048 }
4049 HWY_API Vec128<int16_t, 1> operator>>(const Vec128<int16_t, 1> v,
4050                                       const Vec128<int16_t, 1> bits) {
4051   return Vec128<int16_t, 1>{_mm_sra_epi16(v.raw, bits.raw)};
4052 }
4053 
4054 template <size_t N>
4055 HWY_API Vec128<int32_t, N> operator>>(const Vec128<int32_t, N> v,
4056                                       const Vec128<int32_t, N> bits) {
4057 #if HWY_TARGET <= HWY_AVX3
4058   return Vec128<int32_t, N>{_mm_srav_epi32(v.raw, bits.raw)};
4059 #else
4060   return detail::SignedShr(Simd<int32_t, N>(), v, bits);
4061 #endif
4062 }
4063 HWY_API Vec128<int32_t, 1> operator>>(const Vec128<int32_t, 1> v,
4064                                       const Vec128<int32_t, 1> bits) {
4065   return Vec128<int32_t, 1>{_mm_sra_epi32(v.raw, bits.raw)};
4066 }
4067 
4068 template <size_t N>
4069 HWY_API Vec128<int64_t, N> operator>>(const Vec128<int64_t, N> v,
4070                                       const Vec128<int64_t, N> bits) {
4071 #if HWY_TARGET <= HWY_AVX3
4072   return Vec128<int64_t, N>{_mm_srav_epi64(v.raw, bits.raw)};
4073 #else
4074   return detail::SignedShr(Simd<int64_t, N>(), v, bits);
4075 #endif
4076 }
4077 
4078 // ------------------------------ MulEven/Odd 64x64 (UpperHalf)
4079 
4080 HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
4081                                     const Vec128<uint64_t> b) {
4082   alignas(16) uint64_t mul[2];
4083   mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
4084   return Load(Full128<uint64_t>(), mul);
4085 }
4086 
4087 HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
4088                                    const Vec128<uint64_t> b) {
4089   alignas(16) uint64_t mul[2];
4090   const Half<Full128<uint64_t>> d2;
4091   mul[0] =
4092       Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
4093   return Load(Full128<uint64_t>(), mul);
4094 }
4095 
4096 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
4097 
4098 template <size_t N>
4099 HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N> df32,
4100                                                    Vec128<bfloat16_t, 2 * N> a,
4101                                                    Vec128<bfloat16_t, 2 * N> b,
4102                                                    const Vec128<float, N> sum0,
4103                                                    Vec128<float, N>& sum1) {
4104   // TODO(janwas): _mm_dpbf16_ps when available
4105   const Repartition<uint16_t, decltype(df32)> du16;
4106   const RebindToUnsigned<decltype(df32)> du32;
4107   const Vec128<uint16_t, 2 * N> zero = Zero(du16);
4108   // Lane order within sum0/1 is undefined, hence we can avoid the
4109   // longer-latency lane-crossing PromoteTo.
4110   const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
4111   const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
4112   const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
4113   const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
4114   sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
4115   return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
4116 }
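// Illustrative usage (a sketch; a and b are full bf16 vectors, names ours):
//   const Full128<float> df32;
//   Vec128<float> sum1 = Zero(df32);
//   Vec128<float> sum0 =
//       ReorderWidenMulAccumulate(df32, a, b, Zero(df32), sum1);
//   // Further a/b pairs may be accumulated into sum0/sum1, then combined:
//   const Vec128<float> total = sum0 + sum1;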
4117 
4118 // ================================================== CONVERT
4119 
4120 // ------------------------------ Promotions (part w/ narrow lanes -> full)
4121 
4122 // Unsigned: zero-extend.
4123 template <size_t N>
4124 HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N> /* tag */,
4125                                       const Vec128<uint8_t, N> v) {
4126 #if HWY_TARGET == HWY_SSSE3
4127   const __m128i zero = _mm_setzero_si128();
4128   return Vec128<uint16_t, N>{_mm_unpacklo_epi8(v.raw, zero)};
4129 #else
4130   return Vec128<uint16_t, N>{_mm_cvtepu8_epi16(v.raw)};
4131 #endif
4132 }
4133 template <size_t N>
4134 HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
4135                                       const Vec128<uint16_t, N> v) {
4136 #if HWY_TARGET == HWY_SSSE3
4137   return Vec128<uint32_t, N>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
4138 #else
4139   return Vec128<uint32_t, N>{_mm_cvtepu16_epi32(v.raw)};
4140 #endif
4141 }
4142 template <size_t N>
4143 HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N> /* tag */,
4144                                       const Vec128<uint32_t, N> v) {
4145 #if HWY_TARGET == HWY_SSSE3
4146   return Vec128<uint64_t, N>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())};
4147 #else
4148   return Vec128<uint64_t, N>{_mm_cvtepu32_epi64(v.raw)};
4149 #endif
4150 }
4151 template <size_t N>
4152 HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N> /* tag */,
4153                                       const Vec128<uint8_t, N> v) {
4154 #if HWY_TARGET == HWY_SSSE3
4155   const __m128i zero = _mm_setzero_si128();
4156   const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
4157   return Vec128<uint32_t, N>{_mm_unpacklo_epi16(u16, zero)};
4158 #else
4159   return Vec128<uint32_t, N>{_mm_cvtepu8_epi32(v.raw)};
4160 #endif
4161 }
4162 
4163 // Unsigned to signed: same plus cast.
4164 template <size_t N>
4165 HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> di,
4166                                      const Vec128<uint8_t, N> v) {
4167   return BitCast(di, PromoteTo(Simd<uint16_t, N>(), v));
4168 }
4169 template <size_t N>
4170 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> di,
4171                                      const Vec128<uint16_t, N> v) {
4172   return BitCast(di, PromoteTo(Simd<uint32_t, N>(), v));
4173 }
4174 template <size_t N>
4175 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> di,
4176                                      const Vec128<uint8_t, N> v) {
4177   return BitCast(di, PromoteTo(Simd<uint32_t, N>(), v));
4178 }
4179 
4180 // Signed: replicate sign bit.
4181 template <size_t N>
4182 HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N> /* tag */,
4183                                      const Vec128<int8_t, N> v) {
4184 #if HWY_TARGET == HWY_SSSE3
4185   return ShiftRight<8>(Vec128<int16_t, N>{_mm_unpacklo_epi8(v.raw, v.raw)});
4186 #else
4187   return Vec128<int16_t, N>{_mm_cvtepi8_epi16(v.raw)};
4188 #endif
4189 }
4190 template <size_t N>
4191 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
4192                                      const Vec128<int16_t, N> v) {
4193 #if HWY_TARGET == HWY_SSSE3
4194   return ShiftRight<16>(Vec128<int32_t, N>{_mm_unpacklo_epi16(v.raw, v.raw)});
4195 #else
4196   return Vec128<int32_t, N>{_mm_cvtepi16_epi32(v.raw)};
4197 #endif
4198 }
4199 template <size_t N>
4200 HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N> /* tag */,
4201                                      const Vec128<int32_t, N> v) {
4202 #if HWY_TARGET == HWY_SSSE3
4203   return ShiftRight<32>(Vec128<int64_t, N>{_mm_unpacklo_epi32(v.raw, v.raw)});
4204 #else
4205   return Vec128<int64_t, N>{_mm_cvtepi32_epi64(v.raw)};
4206 #endif
4207 }
4208 template <size_t N>
4209 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N> /* tag */,
4210                                      const Vec128<int8_t, N> v) {
4211 #if HWY_TARGET == HWY_SSSE3
4212   const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
4213   const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
4214   return ShiftRight<24>(Vec128<int32_t, N>{x4});
4215 #else
4216   return Vec128<int32_t, N>{_mm_cvtepi8_epi32(v.raw)};
4217 #endif
4218 }
4219 
4220 // Workaround for origin tracking bug in Clang msan prior to 11.0
4221 // (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid")
4222 #if defined(MEMORY_SANITIZER) && \
4223     (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
4224 #define HWY_INLINE_F16 HWY_NOINLINE
4225 #else
4226 #define HWY_INLINE_F16 HWY_INLINE
4227 #endif
4228 template <size_t N>
4229 HWY_INLINE_F16 Vec128<float, N> PromoteTo(Simd<float, N> df32,
4230                                           const Vec128<float16_t, N> v) {
4231 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
4232   const RebindToSigned<decltype(df32)> di32;
4233   const RebindToUnsigned<decltype(df32)> du32;
4234   // Expand to u32 so we can shift.
4235   const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
4236   const auto sign = ShiftRight<15>(bits16);
4237   const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
4238   const auto mantissa = bits16 & Set(du32, 0x3FF);
4239   const auto subnormal =
4240       BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
4241                         Set(df32, 1.0f / 16384 / 1024));
4242 
4243   const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
4244   const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
4245   const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
4246   const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
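  // For example, the half 0x3C00 (1.0) has biased_exp=15 and mantissa=0, so
  // biased_exp32=127 and bits32=0x3F800000, i.e. 1.0f. The smallest subnormal
  // half, 0x0001, instead takes the subnormal path and yields 2^-24.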
4247   return BitCast(df32, ShiftLeft<31>(sign) | bits32);
4248 #else
4249   (void)df32;
4250   return Vec128<float, N>{_mm_cvtph_ps(v.raw)};
4251 #endif
4252 }
4253 
4254 template <size_t N>
4255 HWY_API Vec128<float, N> PromoteTo(Simd<float, N> df32,
4256                                    const Vec128<bfloat16_t, N> v) {
4257   const Rebind<uint16_t, decltype(df32)> du16;
4258   const RebindToSigned<decltype(df32)> di32;
4259   return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
4260 }
4261 
4262 template <size_t N>
4263 HWY_API Vec128<double, N> PromoteTo(Simd<double, N> /* tag */,
4264                                     const Vec128<float, N> v) {
4265   return Vec128<double, N>{_mm_cvtps_pd(v.raw)};
4266 }
4267 
4268 template <size_t N>
4269 HWY_API Vec128<double, N> PromoteTo(Simd<double, N> /* tag */,
4270                                     const Vec128<int32_t, N> v) {
4271   return Vec128<double, N>{_mm_cvtepi32_pd(v.raw)};
4272 }
4273 
4274 // ------------------------------ Demotions (full -> part w/ narrow lanes)
4275 
4276 template <size_t N>
4277 HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N> /* tag */,
4278                                      const Vec128<int32_t, N> v) {
4279 #if HWY_TARGET == HWY_SSSE3
4280   const Simd<int32_t, N> di32;
4281   const Simd<uint16_t, N * 2> du16;
4282   const auto zero_if_neg = AndNot(ShiftRight<31>(v), v);
4283   const auto too_big = VecFromMask(di32, Gt(v, Set(di32, 0xFFFF)));
4284   const auto clamped = Or(zero_if_neg, too_big);
4285   // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts.
4286   alignas(16) constexpr uint16_t kLower2Bytes[16] = {
4287       0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
4288   const auto lo2 = Load(du16, kLower2Bytes);
4289   return Vec128<uint16_t, N>{TableLookupBytes(BitCast(du16, clamped), lo2).raw};
4290 #else
4291   return Vec128<uint16_t, N>{_mm_packus_epi32(v.raw, v.raw)};
4292 #endif
4293 }
4294 
4295 template <size_t N>
4296 HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N> /* tag */,
4297                                     const Vec128<int32_t, N> v) {
4298   return Vec128<int16_t, N>{_mm_packs_epi32(v.raw, v.raw)};
4299 }
4300 
4301 template <size_t N>
4302 HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */,
4303                                     const Vec128<int32_t, N> v) {
4304   const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
4305   return Vec128<uint8_t, N>{_mm_packus_epi16(i16, i16)};
4306 }
4307 
4308 template <size_t N>
4309 HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N> /* tag */,
4310                                     const Vec128<int16_t, N> v) {
4311   return Vec128<uint8_t, N>{_mm_packus_epi16(v.raw, v.raw)};
4312 }
4313 
4314 template <size_t N>
4315 HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */,
4316                                    const Vec128<int32_t, N> v) {
4317   const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
4318   return Vec128<int8_t, N>{_mm_packs_epi16(i16, i16)};
4319 }
4320 
4321 template <size_t N>
4322 HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N> /* tag */,
4323                                    const Vec128<int16_t, N> v) {
4324   return Vec128<int8_t, N>{_mm_packs_epi16(v.raw, v.raw)};
4325 }
4326 
4327 template <size_t N>
4328 HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N> df16,
4329                                       const Vec128<float, N> v) {
4330 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
4331   const RebindToUnsigned<decltype(df16)> du16;
4332   const Rebind<uint32_t, decltype(df16)> du;
4333   const RebindToSigned<decltype(du)> di;
4334   const auto bits32 = BitCast(du, v);
4335   const auto sign = ShiftRight<31>(bits32);
4336   const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
4337   const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
4338 
4339   const auto k15 = Set(di, 15);
4340   const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
4341   const auto is_tiny = exp < Set(di, -24);
4342 
4343   const auto is_subnormal = exp < Set(di, -14);
4344   const auto biased_exp16 =
4345       BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
4346   const auto sub_exp = BitCast(du, Set(di, -14) - exp);  // [1, 11)
4347   const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
4348                      (mantissa32 >> (Set(du, 13) + sub_exp));
4349   const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
4350                                      ShiftRight<13>(mantissa32));  // <1024
4351 
4352   const auto sign16 = ShiftLeft<15>(sign);
4353   const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
4354   const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
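  // For example, 1.0f has biased_exp32=127 and mantissa32=0, so exp=0,
  // biased_exp16=15 and normal16=0x3C00, the half-precision encoding of 1.0.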
4355   return BitCast(df16, DemoteTo(du16, bits16));
4356 #else
4357   (void)df16;
4358   return Vec128<float16_t, N>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
4359 #endif
4360 }
4361 
4362 template <size_t N>
4363 HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N> dbf16,
4364                                        const Vec128<float, N> v) {
4365   // TODO(janwas): _mm_cvtneps_pbh once we have avx512bf16.
4366   const Rebind<int32_t, decltype(dbf16)> di32;
4367   const Rebind<uint32_t, decltype(dbf16)> du32;  // for logical shift right
4368   const Rebind<uint16_t, decltype(dbf16)> du16;
4369   const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
4370   return BitCast(dbf16, DemoteTo(du16, bits_in_32));
4371 }
4372 
4373 template <size_t N>
4374 HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
4375     Simd<bfloat16_t, 2 * N> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
4376   // TODO(janwas): _mm_cvtne2ps_pbh once we have avx512bf16.
4377   const RebindToUnsigned<decltype(dbf16)> du16;
4378   const Repartition<uint32_t, decltype(dbf16)> du32;
4379   const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
4380   return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
4381 }
4382 
4383 template <size_t N>
4384 HWY_API Vec128<float, N> DemoteTo(Simd<float, N> /* tag */,
4385                                   const Vec128<double, N> v) {
4386   return Vec128<float, N>{_mm_cvtpd_ps(v.raw)};
4387 }
4388 
4389 namespace detail {
4390 
4391 // For well-defined float->int demotion in all x86_*-inl.h.
4392 
4393 template <size_t N>
4394 HWY_INLINE auto ClampF64ToI32Max(Simd<double, N> d, decltype(Zero(d)) v)
4395     -> decltype(Zero(d)) {
4396   // The max can be exactly represented in binary64, so clamping beforehand
4397   // prevents x86 conversion from raising an exception and returning 80..00.
4398   return Min(v, Set(d, 2147483647.0));
4399 }
4400 
4401 // For ConvertTo float->int of same size, clamping before conversion would
4402 // change the result because the max integer value is not exactly representable.
4403 // Instead detect the overflow result after conversion and fix it.
4404 template <typename TI, size_t N, class DF = Simd<MakeFloat<TI>, N>>
4405 HWY_INLINE auto FixConversionOverflow(Simd<TI, N> di,
4406                                       decltype(Zero(DF())) original,
4407                                       decltype(Zero(di).raw) converted_raw)
4408     -> decltype(Zero(di)) {
4409   // Combinations of original and output sign:
4410   //   --: normal <0 or -huge_val to 80..00: OK
4411   //   -+: -0 to 0                         : OK
4412   //   +-: +huge_val to 80..00             : xor with FF..FF to get 7F..FF
4413   //   ++: normal >0                       : OK
4414   const auto converted = decltype(Zero(di)){converted_raw};
4415   const auto sign_wrong = AndNot(BitCast(di, original), converted);
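  // For example, converting +3e9f to int32 yields 80..00: the original sign
  // bit is 0 but the converted sign bit is 1, so sign_wrong is negative and
  // the Xor below flips the result to 7F..FF. For in-range inputs and for
  // negative overflow (already 80..00), sign_wrong is non-negative and the
  // Xor with zero leaves the result unchanged.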
4416   return BitCast(di, Xor(converted, BroadcastSignBit(sign_wrong)));
4417 }
4418 
4419 }  // namespace detail
4420 
4421 template <size_t N>
4422 HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N> /* tag */,
4423                                     const Vec128<double, N> v) {
4424   const auto clamped = detail::ClampF64ToI32Max(Simd<double, N>(), v);
4425   return Vec128<int32_t, N>{_mm_cvttpd_epi32(clamped.raw)};
4426 }
4427 
4428 // For already range-limited input [0, 255].
4429 template <size_t N>
4430 HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
4431   const Simd<uint32_t, N> d32;
4432   const Simd<uint8_t, N * 4> d8;
4433   alignas(16) static constexpr uint32_t k8From32[4] = {
4434       0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u};
  // Also replicate bytes into all 32-bit lanes for safety.
4436   const auto quad = TableLookupBytes(v, Load(d32, k8From32));
4437   return LowerHalf(LowerHalf(BitCast(d8, quad)));
4438 }
4439 
4440 // ------------------------------ Integer <=> fp (ShiftRight, OddEven)
4441 
4442 template <size_t N>
4443 HWY_API Vec128<float, N> ConvertTo(Simd<float, N> /* tag */,
4444                                    const Vec128<int32_t, N> v) {
4445   return Vec128<float, N>{_mm_cvtepi32_ps(v.raw)};
4446 }
4447 
4448 template <size_t N>
4449 HWY_API Vec128<double, N> ConvertTo(Simd<double, N> dd,
4450                                     const Vec128<int64_t, N> v) {
4451 #if HWY_TARGET <= HWY_AVX3
4452   (void)dd;
4453   return Vec128<double, N>{_mm_cvtepi64_pd(v.raw)};
4454 #else
4455   // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
4456   const Repartition<uint32_t, decltype(dd)> d32;
4457   const Repartition<uint64_t, decltype(dd)> d64;
4458 
4459   // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
4460   const auto k84_63 = Set(d64, 0x4530000080000000ULL);
4461   const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
4462 
4463   // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
4464   const auto k52 = Set(d32, 0x43300000);
4465   const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
4466 
4467   const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
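  // Why this works (sketch): with hi = signed upper and lo = unsigned lower
  // 32 bits of v, v_upper == 2^84 + 2^63 + hi * 2^32 exactly (the Xor biases
  // hi by 2^31) and v_lower == 2^52 + lo exactly. Subtracting 2^84 + 2^63 +
  // 2^52 and then adding v_lower yields hi * 2^32 + lo == v, with a single
  // rounding in the final addition.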
4468   return (v_upper - k84_63_52) + v_lower;  // order matters!
4469 #endif
4470 }
4471 
4472 // Truncates (rounds toward zero).
4473 template <size_t N>
4474 HWY_API Vec128<int32_t, N> ConvertTo(const Simd<int32_t, N> di,
4475                                      const Vec128<float, N> v) {
4476   return detail::FixConversionOverflow(di, v, _mm_cvttps_epi32(v.raw));
4477 }
4478 
4479 // Full (partial handled below)
4480 HWY_API Vec128<int64_t> ConvertTo(Full128<int64_t> di, const Vec128<double> v) {
4481 #if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64
4482   return detail::FixConversionOverflow(di, v, _mm_cvttpd_epi64(v.raw));
4483 #elif HWY_ARCH_X86_64
4484   const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw));
4485   const Half<Full128<double>> dd2;
4486   const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw));
4487   return detail::FixConversionOverflow(di, v, _mm_unpacklo_epi64(i0, i1));
4488 #else
4489   using VI = decltype(Zero(di));
4490   const VI k0 = Zero(di);
4491   const VI k1 = Set(di, 1);
4492   const VI k51 = Set(di, 51);
4493 
4494   // Exponent indicates whether the number can be represented as int64_t.
4495   const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
4496   const VI exp = biased_exp - Set(di, 0x3FF);
4497   const auto in_range = exp < Set(di, 63);
4498 
4499   // If we were to cap the exponent at 51 and add 2^52, the number would be in
4500   // [2^52, 2^53) and mantissa bits could be read out directly. We need to
4501   // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
4502   // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
4503   // manually shift the mantissa into place (we already have many of the
4504   // inputs anyway).
4505   const VI shift_mnt = Max(k51 - exp, k0);
4506   const VI shift_int = Max(exp - k51, k0);
4507   const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
4508   // Include implicit 1-bit; shift by one more to ensure it's in the mantissa.
4509   const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
4510   // For inputs larger than 2^52, insert zeros at the bottom.
4511   const VI shifted = int52 << shift_int;
4512   // Restore the one bit lost when shifting in the implicit 1-bit.
4513   const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
4514 
4515   // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
4516   const VI sign_mask = BroadcastSignBit(BitCast(di, v));
4517   const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
4518   const VI magnitude = IfThenElse(in_range, restored, limit);
4519 
4520   // If the input was negative, negate the integer (two's complement).
4521   return (magnitude ^ sign_mask) - sign_mask;
4522 #endif
4523 }
4524 HWY_API Vec128<int64_t, 1> ConvertTo(Simd<int64_t, 1> di,
4525                                      const Vec128<double, 1> v) {
4526   // Only need to specialize for non-AVX3, 64-bit (single scalar op)
4527 #if HWY_TARGET > HWY_AVX3 && HWY_ARCH_X86_64
4528   const Vec128<int64_t, 1> i0{_mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw))};
4529   return detail::FixConversionOverflow(di, v, i0.raw);
4530 #else
4531   (void)di;
4532   const auto full = ConvertTo(Full128<int64_t>(), Vec128<double>{v.raw});
4533   return Vec128<int64_t, 1>{full.raw};
4534 #endif
4535 }
4536 
4537 template <size_t N>
4538 HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
4539   const Simd<int32_t, N> di;
4540   return detail::FixConversionOverflow(di, v, _mm_cvtps_epi32(v.raw));
4541 }
4542 
4543 // ------------------------------ Floating-point rounding (ConvertTo)
4544 
4545 #if HWY_TARGET == HWY_SSSE3
4546 
4547 // Toward nearest integer, ties to even
4548 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4549 HWY_API Vec128<T, N> Round(const Vec128<T, N> v) {
4550   // Rely on rounding after addition with a large value such that no mantissa
4551   // bits remain (assuming the current mode is nearest-even). We may need a
4552   // compiler flag for precise floating-point to prevent "optimizing" this out.
4553   const Simd<T, N> df;
4554   const auto max = Set(df, MantissaEnd<T>());
4555   const auto large = CopySignToAbs(max, v);
4556   const auto added = large + v;
4557   const auto rounded = added - large;
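  // For example (float): large is 2^23 = 8388608, so for v = 2.5 the sum
  // 8388610.5 rounds (ties to even) to 8388610 and subtracting 8388608
  // yields 2.0, the round-to-nearest-even result.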
4558   // Keep original if NaN or the magnitude is large (already an int).
4559   return IfThenElse(Abs(v) < max, rounded, v);
4560 }
4561 
4562 namespace detail {
4563 
// Truncating to integer and converting back to float is correct except when
// the input magnitude is large: such inputs are already integers, because the
// exponent is at least the number of mantissa bits, so no fraction bits remain.
4567 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4568 HWY_INLINE Mask128<T, N> UseInt(const Vec128<T, N> v) {
4569   return Abs(v) < Set(Simd<T, N>(), MantissaEnd<T>());
4570 }
4571 
4572 }  // namespace detail
4573 
4574 // Toward zero, aka truncate
4575 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4576 HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) {
4577   const Simd<T, N> df;
4578   const RebindToSigned<decltype(df)> di;
4579 
4580   const auto integer = ConvertTo(di, v);  // round toward 0
4581   const auto int_f = ConvertTo(df, integer);
4582 
4583   return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
4584 }
4585 
4586 // Toward +infinity, aka ceiling
4587 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4588 HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
4589   const Simd<T, N> df;
4590   const RebindToSigned<decltype(df)> di;
4591 
4592   const auto integer = ConvertTo(di, v);  // round toward 0
4593   const auto int_f = ConvertTo(df, integer);
4594 
4595   // Truncating a positive non-integer ends up smaller; if so, add 1.
4596   const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
4597 
4598   return IfThenElse(detail::UseInt(v), int_f - neg1, v);
4599 }
4600 
4601 // Toward -infinity, aka floor
4602 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4603 HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
4604   const Simd<T, N> df;
4605   const RebindToSigned<decltype(df)> di;
4606 
4607   const auto integer = ConvertTo(di, v);  // round toward 0
4608   const auto int_f = ConvertTo(df, integer);
4609 
4610   // Truncating a negative non-integer ends up larger; if so, subtract 1.
4611   const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
4612 
4613   return IfThenElse(detail::UseInt(v), int_f + neg1, v);
4614 }
4615 
4616 #else
4617 
4618 // Toward nearest integer, ties to even
4619 template <size_t N>
4620 HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
4621   return Vec128<float, N>{
4622       _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
4623 }
4624 template <size_t N>
4625 HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
4626   return Vec128<double, N>{
4627       _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
4628 }
4629 
4630 // Toward zero, aka truncate
4631 template <size_t N>
4632 HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
4633   return Vec128<float, N>{
4634       _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
4635 }
4636 template <size_t N>
4637 HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
4638   return Vec128<double, N>{
4639       _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
4640 }
4641 
4642 // Toward +infinity, aka ceiling
4643 template <size_t N>
4644 HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
4645   return Vec128<float, N>{
4646       _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
4647 }
4648 template <size_t N>
4649 HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
4650   return Vec128<double, N>{
4651       _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
4652 }
4653 
4654 // Toward -infinity, aka floor
4655 template <size_t N>
4656 HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
4657   return Vec128<float, N>{
4658       _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
4659 }
4660 template <size_t N>
4661 HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
4662   return Vec128<double, N>{
4663       _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
4664 }
4665 
4666 #endif  // !HWY_SSSE3
4667 
4668 // ================================================== CRYPTO
4669 
4670 #if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
4671 
4672 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
4673 #ifdef HWY_NATIVE_AES
4674 #undef HWY_NATIVE_AES
4675 #else
4676 #define HWY_NATIVE_AES
4677 #endif
4678 
4679 HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
4680                                  Vec128<uint8_t> round_key) {
4681   return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)};
4682 }
4683 
4684 template <size_t N, HWY_IF_LE128(uint64_t, N)>
4685 HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a,
4686                                        Vec128<uint64_t, N> b) {
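  // Carry-less (GF(2) polynomial) multiply of the lower 64-bit lanes of a and
  // b (selector 0x00). For example, 3 (x+1) multiplied carry-lessly by itself
  // is 5 (x^2+1), not 9.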
4687   return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)};
4688 }
4689 
4690 template <size_t N, HWY_IF_LE128(uint64_t, N)>
4691 HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a,
4692                                        Vec128<uint64_t, N> b) {
4693   return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)};
4694 }
4695 
4696 #endif  // !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
4697 
4698 // ================================================== MISC
4699 
4700 #if HWY_TARGET <= HWY_AVX3
4701 
4702 // ------------------------------ LoadMaskBits
4703 
4704 // `p` points to at least 8 readable bytes, not all of which need be valid.
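// Bit i of bits[] controls lane i (LSB first). For example, bits[0] = 0x05
// with N = 4 yields a mask that is true in lanes 0 and 2.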
4705 template <typename T, size_t N, HWY_IF_LE128(T, N)>
4706 HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N> /* tag */,
4707                                    const uint8_t* HWY_RESTRICT bits) {
4708   uint64_t mask_bits = 0;
4709   constexpr size_t kNumBytes = (N + 7) / 8;
4710   CopyBytes<kNumBytes>(bits, &mask_bits);
4711   if (N < 8) {
4712     mask_bits &= (1ull << N) - 1;
4713   }
4714 
4715   return Mask128<T, N>::FromBits(mask_bits);
4716 }
4717 
4718 // ------------------------------ StoreMaskBits
4719 
4720 // `p` points to at least 8 writable bytes.
4721 template <typename T, size_t N>
4722 HWY_API size_t StoreMaskBits(const Simd<T, N> /* tag */,
4723                              const Mask128<T, N> mask, uint8_t* bits) {
4724   constexpr size_t kNumBytes = (N + 7) / 8;
4725   CopyBytes<kNumBytes>(&mask.raw, bits);
4726 
  // Non-full byte, need to clear the undefined upper bits.
4728   if (N < 8) {
4729     const int mask = (1 << N) - 1;
4730     bits[0] = static_cast<uint8_t>(bits[0] & mask);
4731   }
4732 
4733   return kNumBytes;
4734 }
4735 
4736 // ------------------------------ Mask testing
4737 
4738 // Beware: the suffix indicates the number of mask bits, not lane size!
4739 
4740 template <typename T, size_t N>
4741 HWY_API size_t CountTrue(const Simd<T, N> /* tag */, const Mask128<T, N> mask) {
4742   const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
4743   return PopCount(mask_bits);
4744 }
4745 
4746 template <typename T, size_t N>
4747 HWY_API intptr_t FindFirstTrue(const Simd<T, N> /* tag */,
4748                                const Mask128<T, N> mask) {
4749   const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
4751 }
4752 
4753 template <typename T, size_t N>
4754 HWY_API bool AllFalse(const Simd<T, N> /* tag */, const Mask128<T, N> mask) {
4755   const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
4756   return mask_bits == 0;
4757 }
4758 
4759 template <typename T, size_t N>
4760 HWY_API bool AllTrue(const Simd<T, N> /* tag */, const Mask128<T, N> mask) {
4761   const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
4762   // Cannot use _kortestc because we may have less than 8 mask bits.
4763   return mask_bits == (1u << N) - 1;
4764 }
4765 
4766 // ------------------------------ Compress
4767 
4768 #if HWY_TARGET != HWY_AVX3_DL
4769 namespace detail {
4770 
4771 // Returns permutevar_epi16 indices for 16-bit Compress. Also used by x86_256.
4772 HWY_INLINE Vec128<uint16_t, 8> IndicesForCompress16(uint64_t mask_bits) {
4773   Full128<uint16_t> du16;
  // Table of lane indices for permutexvar_epi16, stored as bytes to reduce
  // L1 usage and unpacked to u16 below. Ideally we would broadcast 8*3 bits
  // (half of the 8 bytes currently used) into each lane and then use a
  // variable shift, but that does not fit in 16 bits.
4777   Rebind<uint8_t, decltype(du16)> du8;
4778   alignas(16) constexpr uint8_t tbl[2048] = {
4779       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
4780       1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 2,
4781       0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
4782       0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 2, 3, 0, 0,
4783       0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0,
4784       0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0,
4785       0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
4786       0, 1, 2, 4, 0, 0, 0, 0, 0, 0, 1, 2, 4, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0,
4787       0, 3, 4, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 2,
4788       3, 4, 0, 0, 0, 0, 0, 0, 2, 3, 4, 0, 0, 0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 1,
4789       2, 3, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 1, 5, 0,
4790       0, 0, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 2, 5, 0, 0, 0, 0, 0, 0, 0, 2, 5, 0,
4791       0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 3, 5, 0, 0, 0,
4792       0, 0, 0, 0, 3, 5, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0,
4793       0, 0, 2, 3, 5, 0, 0, 0, 0, 0, 0, 2, 3, 5, 0, 0, 0, 0, 1, 2, 3, 5, 0, 0, 0,
4794       0, 0, 1, 2, 3, 5, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0,
4795       1, 4, 5, 0, 0, 0, 0, 0, 0, 1, 4, 5, 0, 0, 0, 0, 2, 4, 5, 0, 0, 0, 0, 0, 0,
4796       2, 4, 5, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 3, 4,
4797       5, 0, 0, 0, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 1, 3, 4, 5, 0, 0, 0, 0, 0, 1, 3,
4798       4, 5, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 1, 2, 3, 4,
4799       5, 0, 0, 0, 0, 1, 2, 3, 4, 5, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0,
4800       0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0,
4801       0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0,
4802       0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 3, 6, 0, 0, 0, 0, 0, 1, 3, 6, 0, 0, 0, 0, 0,
4803       0, 1, 3, 6, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 1,
4804       2, 3, 6, 0, 0, 0, 0, 0, 1, 2, 3, 6, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4,
4805       6, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 2, 4, 6,
4806       0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 1, 2, 4, 6, 0, 0, 0, 0, 0, 1, 2, 4,
4807       6, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 1, 3, 4, 6, 0,
4808       0, 0, 0, 0, 1, 3, 4, 6, 0, 0, 0, 2, 3, 4, 6, 0, 0, 0, 0, 0, 2, 3, 4, 6, 0,
4809       0, 0, 1, 2, 3, 4, 6, 0, 0, 0, 0, 1, 2, 3, 4, 6, 0, 0, 5, 6, 0, 0, 0, 0, 0,
4810       0, 0, 5, 6, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0,
4811       2, 5, 6, 0, 0, 0, 0, 0, 0, 2, 5, 6, 0, 0, 0, 0, 1, 2, 5, 6, 0, 0, 0, 0, 0,
4812       1, 2, 5, 6, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 1, 3,
4813       5, 6, 0, 0, 0, 0, 0, 1, 3, 5, 6, 0, 0, 0, 2, 3, 5, 6, 0, 0, 0, 0, 0, 2, 3,
4814       5, 6, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 4, 5, 6, 0,
4815       0, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 1, 4, 5, 6, 0, 0, 0, 0, 0, 1, 4, 5, 6,
4816       0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 1, 2, 4, 5, 6, 0,
4817       0, 0, 0, 1, 2, 4, 5, 6, 0, 0, 3, 4, 5, 6, 0, 0, 0, 0, 0, 3, 4, 5, 6, 0, 0,
4818       0, 1, 3, 4, 5, 6, 0, 0, 0, 0, 1, 3, 4, 5, 6, 0, 0, 2, 3, 4, 5, 6, 0, 0, 0,
4819       0, 2, 3, 4, 5, 6, 0, 0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 1, 2, 3, 4, 5, 6, 0, 7,
4820       0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 7, 0, 0, 0, 0, 0, 0, 0, 1,
4821       7, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 1, 2, 7,
4822       0, 0, 0, 0, 0, 0, 1, 2, 7, 0, 0, 0, 0, 3, 7, 0, 0, 0, 0, 0, 0, 0, 3, 7, 0,
4823       0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 2, 3, 7, 0, 0,
4824       0, 0, 0, 0, 2, 3, 7, 0, 0, 0, 0, 1, 2, 3, 7, 0, 0, 0, 0, 0, 1, 2, 3, 7, 0,
4825       0, 0, 4, 7, 0, 0, 0, 0, 0, 0, 0, 4, 7, 0, 0, 0, 0, 0, 1, 4, 7, 0, 0, 0, 0,
4826       0, 0, 1, 4, 7, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0,
4827       1, 2, 4, 7, 0, 0, 0, 0, 0, 1, 2, 4, 7, 0, 0, 0, 3, 4, 7, 0, 0, 0, 0, 0, 0,
4828       3, 4, 7, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 2, 3,
4829       4, 7, 0, 0, 0, 0, 0, 2, 3, 4, 7, 0, 0, 0, 1, 2, 3, 4, 7, 0, 0, 0, 0, 1, 2,
4830       3, 4, 7, 0, 0, 5, 7, 0, 0, 0, 0, 0, 0, 0, 5, 7, 0, 0, 0, 0, 0, 1, 5, 7, 0,
4831       0, 0, 0, 0, 0, 1, 5, 7, 0, 0, 0, 0, 2, 5, 7, 0, 0, 0, 0, 0, 0, 2, 5, 7, 0,
4832       0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 3, 5, 7, 0, 0, 0,
4833       0, 0, 0, 3, 5, 7, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0,
4834       0, 2, 3, 5, 7, 0, 0, 0, 0, 0, 2, 3, 5, 7, 0, 0, 0, 1, 2, 3, 5, 7, 0, 0, 0,
4835       0, 1, 2, 3, 5, 7, 0, 0, 4, 5, 7, 0, 0, 0, 0, 0, 0, 4, 5, 7, 0, 0, 0, 0, 1,
4836       4, 5, 7, 0, 0, 0, 0, 0, 1, 4, 5, 7, 0, 0, 0, 2, 4, 5, 7, 0, 0, 0, 0, 0, 2,
4837       4, 5, 7, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 3, 4, 5,
4838       7, 0, 0, 0, 0, 0, 3, 4, 5, 7, 0, 0, 0, 1, 3, 4, 5, 7, 0, 0, 0, 0, 1, 3, 4,
4839       5, 7, 0, 0, 2, 3, 4, 5, 7, 0, 0, 0, 0, 2, 3, 4, 5, 7, 0, 0, 1, 2, 3, 4, 5,
4840       7, 0, 0, 0, 1, 2, 3, 4, 5, 7, 0, 6, 7, 0, 0, 0, 0, 0, 0, 0, 6, 7, 0, 0, 0,
4841       0, 0, 1, 6, 7, 0, 0, 0, 0, 0, 0, 1, 6, 7, 0, 0, 0, 0, 2, 6, 7, 0, 0, 0, 0,
4842       0, 0, 2, 6, 7, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0,
4843       3, 6, 7, 0, 0, 0, 0, 0, 0, 3, 6, 7, 0, 0, 0, 0, 1, 3, 6, 7, 0, 0, 0, 0, 0,
4844       1, 3, 6, 7, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 1, 2,
4845       3, 6, 7, 0, 0, 0, 0, 1, 2, 3, 6, 7, 0, 0, 4, 6, 7, 0, 0, 0, 0, 0, 0, 4, 6,
4846       7, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 2, 4, 6, 7,
4847       0, 0, 0, 0, 0, 2, 4, 6, 7, 0, 0, 0, 1, 2, 4, 6, 7, 0, 0, 0, 0, 1, 2, 4, 6,
4848       7, 0, 0, 3, 4, 6, 7, 0, 0, 0, 0, 0, 3, 4, 6, 7, 0, 0, 0, 1, 3, 4, 6, 7, 0,
4849       0, 0, 0, 1, 3, 4, 6, 7, 0, 0, 2, 3, 4, 6, 7, 0, 0, 0, 0, 2, 3, 4, 6, 7, 0,
4850       0, 1, 2, 3, 4, 6, 7, 0, 0, 0, 1, 2, 3, 4, 6, 7, 0, 5, 6, 7, 0, 0, 0, 0, 0,
4851       0, 5, 6, 7, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 2,
4852       5, 6, 7, 0, 0, 0, 0, 0, 2, 5, 6, 7, 0, 0, 0, 1, 2, 5, 6, 7, 0, 0, 0, 0, 1,
4853       2, 5, 6, 7, 0, 0, 3, 5, 6, 7, 0, 0, 0, 0, 0, 3, 5, 6, 7, 0, 0, 0, 1, 3, 5,
4854       6, 7, 0, 0, 0, 0, 1, 3, 5, 6, 7, 0, 0, 2, 3, 5, 6, 7, 0, 0, 0, 0, 2, 3, 5,
4855       6, 7, 0, 0, 1, 2, 3, 5, 6, 7, 0, 0, 0, 1, 2, 3, 5, 6, 7, 0, 4, 5, 6, 7, 0,
4856       0, 0, 0, 0, 4, 5, 6, 7, 0, 0, 0, 1, 4, 5, 6, 7, 0, 0, 0, 0, 1, 4, 5, 6, 7,
4857       0, 0, 2, 4, 5, 6, 7, 0, 0, 0, 0, 2, 4, 5, 6, 7, 0, 0, 1, 2, 4, 5, 6, 7, 0,
4858       0, 0, 1, 2, 4, 5, 6, 7, 0, 3, 4, 5, 6, 7, 0, 0, 0, 0, 3, 4, 5, 6, 7, 0, 0,
4859       1, 3, 4, 5, 6, 7, 0, 0, 0, 1, 3, 4, 5, 6, 7, 0, 2, 3, 4, 5, 6, 7, 0, 0, 0,
4860       2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7};
4861   return PromoteTo(du16, Load(du8, tbl + mask_bits * 8));
4862 }
4863 
4864 }  // namespace detail
4865 #endif  // HWY_TARGET != HWY_AVX3_DL
4866 
4867 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4868 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
4869   const Simd<T, N> d;
4870   const Rebind<uint16_t, decltype(d)> du;
4871   const auto vu = BitCast(du, v);  // (required for float16_t inputs)
4872 
4873 #if HWY_TARGET == HWY_AVX3_DL  // VBMI2
4874   const Vec128<uint16_t, N> cu{_mm_maskz_compress_epi16(mask.raw, vu.raw)};
4875 #else
4876   const auto idx = detail::IndicesForCompress16(uint64_t{mask.raw});
4877   const Vec128<uint16_t, N> cu{_mm_permutexvar_epi16(idx.raw, vu.raw)};
#endif  // HWY_TARGET == HWY_AVX3_DL
4879   return BitCast(d, cu);
4880 }
4881 
4882 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4883 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
4884   return Vec128<T, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
4885 }
4886 
4887 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4888 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
4889   return Vec128<T, N>{_mm_maskz_compress_epi64(mask.raw, v.raw)};
4890 }
4891 
4892 template <size_t N>
4893 HWY_API Vec128<float, N> Compress(Vec128<float, N> v, Mask128<float, N> mask) {
4894   return Vec128<float, N>{_mm_maskz_compress_ps(mask.raw, v.raw)};
4895 }
4896 
4897 template <size_t N>
4898 HWY_API Vec128<double, N> Compress(Vec128<double, N> v,
4899                                    Mask128<double, N> mask) {
4900   return Vec128<double, N>{_mm_maskz_compress_pd(mask.raw, v.raw)};
4901 }
4902 
4903 // ------------------------------ CompressBits (LoadMaskBits)
4904 
4905 template <typename T, size_t N>
4906 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
4907                                   const uint8_t* HWY_RESTRICT bits) {
4908   return Compress(v, LoadMaskBits(Simd<T, N>(), bits));
4909 }
4910 
4911 // ------------------------------ CompressStore
4912 
4913 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4914 HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask, Simd<T, N> d,
4915                              T* HWY_RESTRICT unaligned) {
4916   const Rebind<uint16_t, decltype(d)> du;
4917   const auto vu = BitCast(du, v);  // (required for float16_t inputs)
4918 
4919   const uint64_t mask_bits{mask.raw};
4920 
4921 #if HWY_TARGET == HWY_AVX3_DL  // VBMI2
4922   _mm_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
4923 #else
4924   const auto idx = detail::IndicesForCompress16(mask_bits);
4925   const Vec128<uint16_t, N> cu{_mm_permutexvar_epi16(idx.raw, vu.raw)};
4926   StoreU(BitCast(d, cu), d, unaligned);
4927 #endif  // HWY_TARGET == HWY_AVX3_DL
  return PopCount(mask_bits & ((1ull << N) - 1));
4929 }
4930 
4931 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4932 HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
4933                              Simd<T, N> /* tag */, T* HWY_RESTRICT unaligned) {
4934   _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
4935   return PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
4936 }
4937 
4938 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4939 HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
4940                              Simd<T, N> /* tag */, T* HWY_RESTRICT unaligned) {
4941   _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
4942   return PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
4943 }
4944 
4945 template <size_t N, HWY_IF_LE128(float, N)>
4946 HWY_API size_t CompressStore(Vec128<float, N> v, Mask128<float, N> mask,
4947                              Simd<float, N> /* tag */,
4948                              float* HWY_RESTRICT unaligned) {
4949   _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
4950   return PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
4951 }
4952 
4953 template <size_t N, HWY_IF_LE128(double, N)>
4954 HWY_API size_t CompressStore(Vec128<double, N> v, Mask128<double, N> mask,
4955                              Simd<double, N> /* tag */,
4956                              double* HWY_RESTRICT unaligned) {
4957   _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
4958   return PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
4959 }
4960 
4961 // ------------------------------ CompressBlendedStore (CompressStore)
4962 template <typename T, size_t N>
4963 HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
4964                                     Simd<T, N> d, T* HWY_RESTRICT unaligned) {
4965   // AVX-512 already does the blending at no extra cost (latency 11,
4966   // rthroughput 2 - same as compress plus store).
4967   if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) != 2) {
4968     // We're relying on the mask to blend. Clear the undefined upper bits.
4969     if (N != 16 / sizeof(T)) {
4970       m = And(m, FirstN(d, N));
4971     }
4972     return CompressStore(v, m, d, unaligned);
4973   } else {
    const size_t count = CountTrue(d, m);
4975     const Vec128<T, N> compressed = Compress(v, m);
4976     const Vec128<T, N> prev = LoadU(d, unaligned);
4977     StoreU(IfThenElse(FirstN(d, count), compressed, prev), d, unaligned);
4978     return count;
4979   }
4980 }
4981 
4982 // ------------------------------ CompressBitsStore (LoadMaskBits)
4983 
4984 template <typename T, size_t N>
4985 HWY_API size_t CompressBitsStore(Vec128<T, N> v,
4986                                  const uint8_t* HWY_RESTRICT bits, Simd<T, N> d,
4987                                  T* HWY_RESTRICT unaligned) {
4988   return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
4989 }
4990 
4991 #else  // AVX2 or below
4992 
4993 // ------------------------------ LoadMaskBits (TestBit)
4994 
4995 namespace detail {
4996 
4997 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
4998 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t mask_bits) {
4999   const RebindToUnsigned<decltype(d)> du;
  // Easier than Set(), which would require a >8-bit lane type; that would not
  // compile for T=uint8_t, N=1.
5002   const Vec128<T, N> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))};
5003 
5004   // Replicate bytes 8x such that each byte contains the bit that governs it.
5005   alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
5006                                              1, 1, 1, 1, 1, 1, 1, 1};
5007   const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
5008 
5009   alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
5010                                             1, 2, 4, 8, 16, 32, 64, 128};
5011   return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
5012 }
5013 
5014 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
5015 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t mask_bits) {
5016   const RebindToUnsigned<decltype(d)> du;
5017   alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
5018   const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
5019   return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
5020 }
5021 
5022 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
5023 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t mask_bits) {
5024   const RebindToUnsigned<decltype(d)> du;
5025   alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
5026   const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
5027   return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
5028 }
5029 
5030 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
5031 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N> d, uint64_t mask_bits) {
5032   const RebindToUnsigned<decltype(d)> du;
5033   alignas(16) constexpr uint64_t kBit[8] = {1, 2};
5034   return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
5035 }
5036 
5037 }  // namespace detail
5038 
5039 // `p` points to at least 8 readable bytes, not all of which need be valid.
5040 template <typename T, size_t N, HWY_IF_LE128(T, N)>
5041 HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N> d,
5042                                    const uint8_t* HWY_RESTRICT bits) {
5043   uint64_t mask_bits = 0;
5044   constexpr size_t kNumBytes = (N + 7) / 8;
5045   CopyBytes<kNumBytes>(bits, &mask_bits);
5046   if (N < 8) {
5047     mask_bits &= (1ull << N) - 1;
5048   }
5049 
5050   return detail::LoadMaskBits(d, mask_bits);
5051 }
5052 
5053 // ------------------------------ StoreMaskBits
5054 
5055 namespace detail {
5056 
5057 constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
5058   return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
5059 }
5060 
5061 template <typename T, size_t N>
5062 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
5063                                  const Mask128<T, N> mask) {
5064   const Simd<T, N> d;
5065   const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw;
5066   return U64FromInt(_mm_movemask_epi8(sign_bits));
5067 }
5068 
5069 template <typename T, size_t N>
5070 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
5071                                  const Mask128<T, N> mask) {
5072   // Remove useless lower half of each u16 while preserving the sign bit.
5073   const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
5074   return U64FromInt(_mm_movemask_epi8(sign_bits));
5075 }
5076 
5077 template <typename T, size_t N>
5078 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
5079                                  const Mask128<T, N> mask) {
5080   const Simd<T, N> d;
5081   const Simd<float, N> df;
5082   const auto sign_bits = BitCast(df, VecFromMask(d, mask));
5083   return U64FromInt(_mm_movemask_ps(sign_bits.raw));
5084 }
5085 
5086 template <typename T, size_t N>
5087 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
5088                                  const Mask128<T, N> mask) {
5089   const Simd<T, N> d;
5090   const Simd<double, N> df;
5091   const auto sign_bits = BitCast(df, VecFromMask(d, mask));
5092   return U64FromInt(_mm_movemask_pd(sign_bits.raw));
5093 }
5094 
5095 // Returns the lowest N of the _mm_movemask* bits.
5096 template <typename T, size_t N>
5097 constexpr uint64_t OnlyActive(uint64_t mask_bits) {
5098   return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
5099 }
5100 
5101 template <typename T, size_t N>
5102 HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
5103   return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
5104 }
5105 
5106 }  // namespace detail
5107 
5108 // `p` points to at least 8 writable bytes.
5109 template <typename T, size_t N>
5110 HWY_API size_t StoreMaskBits(const Simd<T, N> /* tag */,
5111                              const Mask128<T, N> mask, uint8_t* bits) {
5112   constexpr size_t kNumBytes = (N + 7) / 8;
5113   const uint64_t mask_bits = detail::BitsFromMask(mask);
5114   CopyBytes<kNumBytes>(&mask_bits, bits);
5115   return kNumBytes;
5116 }
5117 
5118 // ------------------------------ Mask testing
5119 
5120 template <typename T, size_t N>
5121 HWY_API bool AllFalse(const Simd<T, N> /* tag */, const Mask128<T, N> mask) {
5122   // Cheaper than PTEST, which is 2 uop / 3L.
5123   return detail::BitsFromMask(mask) == 0;
5124 }
5125 
5126 template <typename T, size_t N>
5127 HWY_API bool AllTrue(const Simd<T, N> /* tag */, const Mask128<T, N> mask) {
5128   constexpr uint64_t kAllBits =
5129       detail::OnlyActive<T, N>((1ull << (16 / sizeof(T))) - 1);
5130   return detail::BitsFromMask(mask) == kAllBits;
5131 }
5132 
5133 template <typename T, size_t N>
5134 HWY_API size_t CountTrue(const Simd<T, N> /* tag */, const Mask128<T, N> mask) {
5135   return PopCount(detail::BitsFromMask(mask));
5136 }
5137 
5138 template <typename T, size_t N>
5139 HWY_API intptr_t FindFirstTrue(const Simd<T, N> /* tag */,
5140                                const Mask128<T, N> mask) {
5141   const uint64_t mask_bits = detail::BitsFromMask(mask);
5142   return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
5143 }
5144 
5145 // ------------------------------ Compress, CompressBits
5146 
5147 namespace detail {
5148 
5149 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
5150 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N> d, uint64_t mask_bits) {
5151   HWY_DASSERT(mask_bits < 256);
5152   const Rebind<uint8_t, decltype(d)> d8;
5153   const Simd<uint16_t, N> du;
5154 
5155   // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
5156   // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
5157   // 8 mask bits). Loading them directly would require 4 KiB. We can instead
5158   // store lane indices and convert to byte indices (2*lane + 0..1), with the
5159   // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
5160   // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
5161   // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
5162   // is likely more costly than the higher cache footprint from storing bytes.
5163   alignas(16) constexpr uint8_t table[2048] = {
5164       0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,
5165       0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  4,  0,  0,  0,
5166       0,  0,  0,  0,  0,  4,  0,  0,  0,  0,  0,  0,  2,  4,  0,  0,  0,  0,
5167       0,  0,  0,  2,  4,  0,  0,  0,  0,  0,  6,  0,  0,  0,  0,  0,  0,  0,
5168       0,  6,  0,  0,  0,  0,  0,  0,  2,  6,  0,  0,  0,  0,  0,  0,  0,  2,
5169       6,  0,  0,  0,  0,  0,  4,  6,  0,  0,  0,  0,  0,  0,  0,  4,  6,  0,
5170       0,  0,  0,  0,  2,  4,  6,  0,  0,  0,  0,  0,  0,  2,  4,  6,  0,  0,
5171       0,  0,  8,  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  0,  0,  0,  0,  0,
5172       2,  8,  0,  0,  0,  0,  0,  0,  0,  2,  8,  0,  0,  0,  0,  0,  4,  8,
5173       0,  0,  0,  0,  0,  0,  0,  4,  8,  0,  0,  0,  0,  0,  2,  4,  8,  0,
5174       0,  0,  0,  0,  0,  2,  4,  8,  0,  0,  0,  0,  6,  8,  0,  0,  0,  0,
5175       0,  0,  0,  6,  8,  0,  0,  0,  0,  0,  2,  6,  8,  0,  0,  0,  0,  0,
5176       0,  2,  6,  8,  0,  0,  0,  0,  4,  6,  8,  0,  0,  0,  0,  0,  0,  4,
5177       6,  8,  0,  0,  0,  0,  2,  4,  6,  8,  0,  0,  0,  0,  0,  2,  4,  6,
5178       8,  0,  0,  0,  10, 0,  0,  0,  0,  0,  0,  0,  0,  10, 0,  0,  0,  0,
5179       0,  0,  2,  10, 0,  0,  0,  0,  0,  0,  0,  2,  10, 0,  0,  0,  0,  0,
5180       4,  10, 0,  0,  0,  0,  0,  0,  0,  4,  10, 0,  0,  0,  0,  0,  2,  4,
5181       10, 0,  0,  0,  0,  0,  0,  2,  4,  10, 0,  0,  0,  0,  6,  10, 0,  0,
5182       0,  0,  0,  0,  0,  6,  10, 0,  0,  0,  0,  0,  2,  6,  10, 0,  0,  0,
5183       0,  0,  0,  2,  6,  10, 0,  0,  0,  0,  4,  6,  10, 0,  0,  0,  0,  0,
5184       0,  4,  6,  10, 0,  0,  0,  0,  2,  4,  6,  10, 0,  0,  0,  0,  0,  2,
5185       4,  6,  10, 0,  0,  0,  8,  10, 0,  0,  0,  0,  0,  0,  0,  8,  10, 0,
5186       0,  0,  0,  0,  2,  8,  10, 0,  0,  0,  0,  0,  0,  2,  8,  10, 0,  0,
5187       0,  0,  4,  8,  10, 0,  0,  0,  0,  0,  0,  4,  8,  10, 0,  0,  0,  0,
5188       2,  4,  8,  10, 0,  0,  0,  0,  0,  2,  4,  8,  10, 0,  0,  0,  6,  8,
5189       10, 0,  0,  0,  0,  0,  0,  6,  8,  10, 0,  0,  0,  0,  2,  6,  8,  10,
5190       0,  0,  0,  0,  0,  2,  6,  8,  10, 0,  0,  0,  4,  6,  8,  10, 0,  0,
5191       0,  0,  0,  4,  6,  8,  10, 0,  0,  0,  2,  4,  6,  8,  10, 0,  0,  0,
5192       0,  2,  4,  6,  8,  10, 0,  0,  12, 0,  0,  0,  0,  0,  0,  0,  0,  12,
5193       0,  0,  0,  0,  0,  0,  2,  12, 0,  0,  0,  0,  0,  0,  0,  2,  12, 0,
5194       0,  0,  0,  0,  4,  12, 0,  0,  0,  0,  0,  0,  0,  4,  12, 0,  0,  0,
5195       0,  0,  2,  4,  12, 0,  0,  0,  0,  0,  0,  2,  4,  12, 0,  0,  0,  0,
5196       6,  12, 0,  0,  0,  0,  0,  0,  0,  6,  12, 0,  0,  0,  0,  0,  2,  6,
5197       12, 0,  0,  0,  0,  0,  0,  2,  6,  12, 0,  0,  0,  0,  4,  6,  12, 0,
5198       0,  0,  0,  0,  0,  4,  6,  12, 0,  0,  0,  0,  2,  4,  6,  12, 0,  0,
5199       0,  0,  0,  2,  4,  6,  12, 0,  0,  0,  8,  12, 0,  0,  0,  0,  0,  0,
5200       0,  8,  12, 0,  0,  0,  0,  0,  2,  8,  12, 0,  0,  0,  0,  0,  0,  2,
5201       8,  12, 0,  0,  0,  0,  4,  8,  12, 0,  0,  0,  0,  0,  0,  4,  8,  12,
5202       0,  0,  0,  0,  2,  4,  8,  12, 0,  0,  0,  0,  0,  2,  4,  8,  12, 0,
5203       0,  0,  6,  8,  12, 0,  0,  0,  0,  0,  0,  6,  8,  12, 0,  0,  0,  0,
5204       2,  6,  8,  12, 0,  0,  0,  0,  0,  2,  6,  8,  12, 0,  0,  0,  4,  6,
5205       8,  12, 0,  0,  0,  0,  0,  4,  6,  8,  12, 0,  0,  0,  2,  4,  6,  8,
5206       12, 0,  0,  0,  0,  2,  4,  6,  8,  12, 0,  0,  10, 12, 0,  0,  0,  0,
5207       0,  0,  0,  10, 12, 0,  0,  0,  0,  0,  2,  10, 12, 0,  0,  0,  0,  0,
5208       0,  2,  10, 12, 0,  0,  0,  0,  4,  10, 12, 0,  0,  0,  0,  0,  0,  4,
5209       10, 12, 0,  0,  0,  0,  2,  4,  10, 12, 0,  0,  0,  0,  0,  2,  4,  10,
5210       12, 0,  0,  0,  6,  10, 12, 0,  0,  0,  0,  0,  0,  6,  10, 12, 0,  0,
5211       0,  0,  2,  6,  10, 12, 0,  0,  0,  0,  0,  2,  6,  10, 12, 0,  0,  0,
5212       4,  6,  10, 12, 0,  0,  0,  0,  0,  4,  6,  10, 12, 0,  0,  0,  2,  4,
5213       6,  10, 12, 0,  0,  0,  0,  2,  4,  6,  10, 12, 0,  0,  8,  10, 12, 0,
5214       0,  0,  0,  0,  0,  8,  10, 12, 0,  0,  0,  0,  2,  8,  10, 12, 0,  0,
5215       0,  0,  0,  2,  8,  10, 12, 0,  0,  0,  4,  8,  10, 12, 0,  0,  0,  0,
5216       0,  4,  8,  10, 12, 0,  0,  0,  2,  4,  8,  10, 12, 0,  0,  0,  0,  2,
5217       4,  8,  10, 12, 0,  0,  6,  8,  10, 12, 0,  0,  0,  0,  0,  6,  8,  10,
5218       12, 0,  0,  0,  2,  6,  8,  10, 12, 0,  0,  0,  0,  2,  6,  8,  10, 12,
5219       0,  0,  4,  6,  8,  10, 12, 0,  0,  0,  0,  4,  6,  8,  10, 12, 0,  0,
5220       2,  4,  6,  8,  10, 12, 0,  0,  0,  2,  4,  6,  8,  10, 12, 0,  14, 0,
5221       0,  0,  0,  0,  0,  0,  0,  14, 0,  0,  0,  0,  0,  0,  2,  14, 0,  0,
5222       0,  0,  0,  0,  0,  2,  14, 0,  0,  0,  0,  0,  4,  14, 0,  0,  0,  0,
5223       0,  0,  0,  4,  14, 0,  0,  0,  0,  0,  2,  4,  14, 0,  0,  0,  0,  0,
5224       0,  2,  4,  14, 0,  0,  0,  0,  6,  14, 0,  0,  0,  0,  0,  0,  0,  6,
5225       14, 0,  0,  0,  0,  0,  2,  6,  14, 0,  0,  0,  0,  0,  0,  2,  6,  14,
5226       0,  0,  0,  0,  4,  6,  14, 0,  0,  0,  0,  0,  0,  4,  6,  14, 0,  0,
5227       0,  0,  2,  4,  6,  14, 0,  0,  0,  0,  0,  2,  4,  6,  14, 0,  0,  0,
5228       8,  14, 0,  0,  0,  0,  0,  0,  0,  8,  14, 0,  0,  0,  0,  0,  2,  8,
5229       14, 0,  0,  0,  0,  0,  0,  2,  8,  14, 0,  0,  0,  0,  4,  8,  14, 0,
5230       0,  0,  0,  0,  0,  4,  8,  14, 0,  0,  0,  0,  2,  4,  8,  14, 0,  0,
5231       0,  0,  0,  2,  4,  8,  14, 0,  0,  0,  6,  8,  14, 0,  0,  0,  0,  0,
5232       0,  6,  8,  14, 0,  0,  0,  0,  2,  6,  8,  14, 0,  0,  0,  0,  0,  2,
5233       6,  8,  14, 0,  0,  0,  4,  6,  8,  14, 0,  0,  0,  0,  0,  4,  6,  8,
5234       14, 0,  0,  0,  2,  4,  6,  8,  14, 0,  0,  0,  0,  2,  4,  6,  8,  14,
5235       0,  0,  10, 14, 0,  0,  0,  0,  0,  0,  0,  10, 14, 0,  0,  0,  0,  0,
5236       2,  10, 14, 0,  0,  0,  0,  0,  0,  2,  10, 14, 0,  0,  0,  0,  4,  10,
5237       14, 0,  0,  0,  0,  0,  0,  4,  10, 14, 0,  0,  0,  0,  2,  4,  10, 14,
5238       0,  0,  0,  0,  0,  2,  4,  10, 14, 0,  0,  0,  6,  10, 14, 0,  0,  0,
5239       0,  0,  0,  6,  10, 14, 0,  0,  0,  0,  2,  6,  10, 14, 0,  0,  0,  0,
5240       0,  2,  6,  10, 14, 0,  0,  0,  4,  6,  10, 14, 0,  0,  0,  0,  0,  4,
5241       6,  10, 14, 0,  0,  0,  2,  4,  6,  10, 14, 0,  0,  0,  0,  2,  4,  6,
5242       10, 14, 0,  0,  8,  10, 14, 0,  0,  0,  0,  0,  0,  8,  10, 14, 0,  0,
5243       0,  0,  2,  8,  10, 14, 0,  0,  0,  0,  0,  2,  8,  10, 14, 0,  0,  0,
5244       4,  8,  10, 14, 0,  0,  0,  0,  0,  4,  8,  10, 14, 0,  0,  0,  2,  4,
5245       8,  10, 14, 0,  0,  0,  0,  2,  4,  8,  10, 14, 0,  0,  6,  8,  10, 14,
5246       0,  0,  0,  0,  0,  6,  8,  10, 14, 0,  0,  0,  2,  6,  8,  10, 14, 0,
5247       0,  0,  0,  2,  6,  8,  10, 14, 0,  0,  4,  6,  8,  10, 14, 0,  0,  0,
5248       0,  4,  6,  8,  10, 14, 0,  0,  2,  4,  6,  8,  10, 14, 0,  0,  0,  2,
5249       4,  6,  8,  10, 14, 0,  12, 14, 0,  0,  0,  0,  0,  0,  0,  12, 14, 0,
5250       0,  0,  0,  0,  2,  12, 14, 0,  0,  0,  0,  0,  0,  2,  12, 14, 0,  0,
5251       0,  0,  4,  12, 14, 0,  0,  0,  0,  0,  0,  4,  12, 14, 0,  0,  0,  0,
5252       2,  4,  12, 14, 0,  0,  0,  0,  0,  2,  4,  12, 14, 0,  0,  0,  6,  12,
5253       14, 0,  0,  0,  0,  0,  0,  6,  12, 14, 0,  0,  0,  0,  2,  6,  12, 14,
5254       0,  0,  0,  0,  0,  2,  6,  12, 14, 0,  0,  0,  4,  6,  12, 14, 0,  0,
5255       0,  0,  0,  4,  6,  12, 14, 0,  0,  0,  2,  4,  6,  12, 14, 0,  0,  0,
5256       0,  2,  4,  6,  12, 14, 0,  0,  8,  12, 14, 0,  0,  0,  0,  0,  0,  8,
5257       12, 14, 0,  0,  0,  0,  2,  8,  12, 14, 0,  0,  0,  0,  0,  2,  8,  12,
5258       14, 0,  0,  0,  4,  8,  12, 14, 0,  0,  0,  0,  0,  4,  8,  12, 14, 0,
5259       0,  0,  2,  4,  8,  12, 14, 0,  0,  0,  0,  2,  4,  8,  12, 14, 0,  0,
5260       6,  8,  12, 14, 0,  0,  0,  0,  0,  6,  8,  12, 14, 0,  0,  0,  2,  6,
5261       8,  12, 14, 0,  0,  0,  0,  2,  6,  8,  12, 14, 0,  0,  4,  6,  8,  12,
5262       14, 0,  0,  0,  0,  4,  6,  8,  12, 14, 0,  0,  2,  4,  6,  8,  12, 14,
5263       0,  0,  0,  2,  4,  6,  8,  12, 14, 0,  10, 12, 14, 0,  0,  0,  0,  0,
5264       0,  10, 12, 14, 0,  0,  0,  0,  2,  10, 12, 14, 0,  0,  0,  0,  0,  2,
5265       10, 12, 14, 0,  0,  0,  4,  10, 12, 14, 0,  0,  0,  0,  0,  4,  10, 12,
5266       14, 0,  0,  0,  2,  4,  10, 12, 14, 0,  0,  0,  0,  2,  4,  10, 12, 14,
5267       0,  0,  6,  10, 12, 14, 0,  0,  0,  0,  0,  6,  10, 12, 14, 0,  0,  0,
5268       2,  6,  10, 12, 14, 0,  0,  0,  0,  2,  6,  10, 12, 14, 0,  0,  4,  6,
5269       10, 12, 14, 0,  0,  0,  0,  4,  6,  10, 12, 14, 0,  0,  2,  4,  6,  10,
5270       12, 14, 0,  0,  0,  2,  4,  6,  10, 12, 14, 0,  8,  10, 12, 14, 0,  0,
5271       0,  0,  0,  8,  10, 12, 14, 0,  0,  0,  2,  8,  10, 12, 14, 0,  0,  0,
5272       0,  2,  8,  10, 12, 14, 0,  0,  4,  8,  10, 12, 14, 0,  0,  0,  0,  4,
5273       8,  10, 12, 14, 0,  0,  2,  4,  8,  10, 12, 14, 0,  0,  0,  2,  4,  8,
5274       10, 12, 14, 0,  6,  8,  10, 12, 14, 0,  0,  0,  0,  6,  8,  10, 12, 14,
5275       0,  0,  2,  6,  8,  10, 12, 14, 0,  0,  0,  2,  6,  8,  10, 12, 14, 0,
5276       4,  6,  8,  10, 12, 14, 0,  0,  0,  4,  6,  8,  10, 12, 14, 0,  2,  4,
5277       6,  8,  10, 12, 14, 0,  0,  2,  4,  6,  8,  10, 12, 14};
5278 
5279   const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
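  // Each loaded byte is the index of the low byte of a selected u16 lane (the
  // doubling is baked into the table). Zipping the bytes with themselves and
  // adding 0x0100 forms the byte pair (k, k+1) expected by TableLookupBytes;
  // for example, a stored 6 selects bytes 6 and 7, i.e. the fourth u16 lane.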
5280   const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
5281   return BitCast(d, pairs + Set(du, 0x0100));
5282 }
5283 
5284 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
5285 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N> d, uint64_t mask_bits) {
5286   HWY_DASSERT(mask_bits < 16);
5287 
5288   // There are only 4 lanes, so we can afford to load the index vector directly.
5289   alignas(16) constexpr uint8_t packed_array[256] = {
5290       0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  //
5291       0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  //
5292       4,  5,  6,  7,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  //
5293       0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  0,  1,  2,  3,  //
5294       8,  9,  10, 11, 0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  //
5295       0,  1,  2,  3,  8,  9,  10, 11, 0,  1,  2,  3,  0,  1,  2,  3,  //
5296       4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  0,  1,  2,  3,  //
5297       0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  //
5298       12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  //
5299       0,  1,  2,  3,  12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,  //
5300       4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,  //
5301       0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  //
5302       8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,  //
5303       0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  //
5304       4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  //
5305       0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15};
5306 
5307   const Repartition<uint8_t, decltype(d)> d8;
5308   return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
5309 }
5310 
5311 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
5312 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N> d, uint64_t mask_bits) {
5313   HWY_DASSERT(mask_bits < 4);
5314 
5315   // There are only 2 lanes, so we can afford to load the index vector directly.
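  // Each 16-byte row corresponds to one 2-bit mask value; remaining entries
  // repeat lane 0 (bytes 0..7).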
5316   alignas(16) constexpr uint8_t packed_array[64] = {
5317       0, 1, 2,  3,  4,  5,  6,  7,  0, 1, 2,  3,  4,  5,  6,  7,  //
5318       0, 1, 2,  3,  4,  5,  6,  7,  0, 1, 2,  3,  4,  5,  6,  7,  //
5319       8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,  //
5320       0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
5321 
5322   const Repartition<uint8_t, decltype(d)> d8;
5323   return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
5324 }
5325 
5326 }  // namespace detail
5327 
5328 template <typename T, size_t N>
5329 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> m) {
5330   const Simd<T, N> d;
5331   const RebindToUnsigned<decltype(d)> du;
5332 
5333   const uint64_t mask_bits = detail::BitsFromMask(m);
5334   HWY_DASSERT(mask_bits < (1ull << N));
5335 
5336   const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5337   return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
5338 }
5339 
5340 template <typename T, size_t N>
5341 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
5342                                   const uint8_t* HWY_RESTRICT bits) {
5343   const Simd<T, N> d;
5344   const RebindToUnsigned<decltype(d)> du;
5345 
5346   uint64_t mask_bits = 0;
5347   constexpr size_t kNumBytes = (N + 7) / 8;
5348   CopyBytes<kNumBytes>(bits, &mask_bits);
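  // Clear any bits beyond the lane count so mask_bits stays below (1 << N).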
5349   if (N < 8) {
5350     mask_bits &= (1ull << N) - 1;
5351   }
5352 
5353   const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5354   return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
5355 }
5356 
5357 // ------------------------------ CompressStore, CompressBitsStore
5358 
5359 template <typename T, size_t N>
5360 HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N> d,
5361                              T* HWY_RESTRICT unaligned) {
5362   const RebindToUnsigned<decltype(d)> du;
5363 
5364   const uint64_t mask_bits = detail::BitsFromMask(m);
5365   HWY_DASSERT(mask_bits < (1ull << N));
5366 
5367   // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
5368   const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5369   const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
5370   StoreU(compressed, d, unaligned);
5371   return PopCount(mask_bits);
5372 }
5373 
5374 template <typename T, size_t N>
5375 HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
5376                                     Simd<T, N> d, T* HWY_RESTRICT unaligned) {
5377   const RebindToUnsigned<decltype(d)> du;
5378 
5379   const uint64_t mask_bits = detail::BitsFromMask(m);
5380   HWY_DASSERT(mask_bits < (1ull << N));
5381   const size_t count = PopCount(mask_bits);
5382 
5383   // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
5384   const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5385   const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
5386 
  // Blend with the existing contents so that only the first `count` lanes are
  // modified; note the full vector is still loaded and stored.
  const Vec128<T, N> prev = LoadU(d, unaligned);
5388   StoreU(IfThenElse(FirstN(d, count), compressed, prev), d, unaligned);
5389   return count;
5390 }
5391 
5392 template <typename T, size_t N>
5393 HWY_API size_t CompressBitsStore(Vec128<T, N> v,
5394                                  const uint8_t* HWY_RESTRICT bits, Simd<T, N> d,
5395                                  T* HWY_RESTRICT unaligned) {
5396   const RebindToUnsigned<decltype(d)> du;
5397 
5398   uint64_t mask_bits = 0;
5399   constexpr size_t kNumBytes = (N + 7) / 8;
5400   CopyBytes<kNumBytes>(bits, &mask_bits);
5401   if (N < 8) {
5402     mask_bits &= (1ull << N) - 1;
5403   }
5404 
5405   // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
5406   const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5407   const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
5408   StoreU(compressed, d, unaligned);
5409   return PopCount(mask_bits);
5410 }
5411 
5412 #endif  // HWY_TARGET <= HWY_AVX3
5413 
5414 // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
5415 // TableLookupBytes)
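//
// Writes the bytes of v0, v1, v2 interleaved as v0[0], v1[0], v2[0], v0[1],
// ... (e.g. planar to packed RGB); the 128-bit overload stores 3 * 16 bytes.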
5416 
5417 // 128 bits
5418 HWY_API void StoreInterleaved3(const Vec128<uint8_t> v0,
5419                                const Vec128<uint8_t> v1,
5420                                const Vec128<uint8_t> v2, Full128<uint8_t> d,
5421                                uint8_t* HWY_RESTRICT unaligned) {
5422   const auto k5 = Set(d, 5);
5423   const auto k6 = Set(d, 6);
5424 
5425   // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
5426   // 0x80 so lanes to be filled from other vectors are 0 for blending.
5427   alignas(16) static constexpr uint8_t tbl_r0[16] = {
5428       0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
5429       3, 0x80, 0x80, 4, 0x80, 0x80, 5};
5430   alignas(16) static constexpr uint8_t tbl_g0[16] = {
5431       0x80, 0, 0x80, 0x80, 1, 0x80,  //
5432       0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
5433   const auto shuf_r0 = Load(d, tbl_r0);
5434   const auto shuf_g0 = Load(d, tbl_g0);  // cannot reuse r0 due to 5 in MSB
5435   const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
5436   const auto r0 = TableLookupBytes(v0, shuf_r0);  // 5..4..3..2..1..0
5437   const auto g0 = TableLookupBytes(v1, shuf_g0);  // ..4..3..2..1..0.
5438   const auto b0 = TableLookupBytes(v2, shuf_b0);  // .4..3..2..1..0..
5439   const auto int0 = r0 | g0 | b0;
5440   StoreU(int0, d, unaligned + 0 * 16);
5441 
5442   // Second vector: g10,r10, bgr[9:6], b5,g5
5443   const auto shuf_r1 = shuf_b0 + k6;  // .A..9..8..7..6..
5444   const auto shuf_g1 = shuf_r0 + k5;  // A..9..8..7..6..5
5445   const auto shuf_b1 = shuf_g0 + k5;  // ..9..8..7..6..5.
5446   const auto r1 = TableLookupBytes(v0, shuf_r1);
5447   const auto g1 = TableLookupBytes(v1, shuf_g1);
5448   const auto b1 = TableLookupBytes(v2, shuf_b1);
5449   const auto int1 = r1 | g1 | b1;
5450   StoreU(int1, d, unaligned + 1 * 16);
5451 
5452   // Third vector: bgr[15:11], b10
5453   const auto shuf_r2 = shuf_b1 + k6;  // ..F..E..D..C..B.
5454   const auto shuf_g2 = shuf_r1 + k5;  // .F..E..D..C..B..
5455   const auto shuf_b2 = shuf_g1 + k5;  // F..E..D..C..B..A
5456   const auto r2 = TableLookupBytes(v0, shuf_r2);
5457   const auto g2 = TableLookupBytes(v1, shuf_g2);
5458   const auto b2 = TableLookupBytes(v2, shuf_b2);
5459   const auto int2 = r2 | g2 | b2;
5460   StoreU(int2, d, unaligned + 2 * 16);
5461 }
5462 
5463 // 64 bits
5464 HWY_API void StoreInterleaved3(const Vec128<uint8_t, 8> v0,
5465                                const Vec128<uint8_t, 8> v1,
5466                                const Vec128<uint8_t, 8> v2, Simd<uint8_t, 8> d,
5467                                uint8_t* HWY_RESTRICT unaligned) {
5468   // Use full vectors for the shuffles and first result.
5469   const Full128<uint8_t> d_full;
5470   const auto k5 = Set(d_full, 5);
5471   const auto k6 = Set(d_full, 6);
5472 
5473   const Vec128<uint8_t> full_a{v0.raw};
5474   const Vec128<uint8_t> full_b{v1.raw};
5475   const Vec128<uint8_t> full_c{v2.raw};
5476 
5477   // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
5478   // 0x80 so lanes to be filled from other vectors are 0 for blending.
5479   alignas(16) static constexpr uint8_t tbl_r0[16] = {
5480       0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
5481       3, 0x80, 0x80, 4, 0x80, 0x80, 5};
5482   alignas(16) static constexpr uint8_t tbl_g0[16] = {
5483       0x80, 0, 0x80, 0x80, 1, 0x80,  //
5484       0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
5485   const auto shuf_r0 = Load(d_full, tbl_r0);
5486   const auto shuf_g0 = Load(d_full, tbl_g0);  // cannot reuse r0 due to 5 in MSB
5487   const auto shuf_b0 = CombineShiftRightBytes<15>(d_full, shuf_g0, shuf_g0);
5488   const auto r0 = TableLookupBytes(full_a, shuf_r0);  // 5..4..3..2..1..0
5489   const auto g0 = TableLookupBytes(full_b, shuf_g0);  // ..4..3..2..1..0.
5490   const auto b0 = TableLookupBytes(full_c, shuf_b0);  // .4..3..2..1..0..
5491   const auto int0 = r0 | g0 | b0;
5492   StoreU(int0, d_full, unaligned + 0 * 16);
5493 
5494   // Second (HALF) vector: bgr[7:6], b5,g5
5495   const auto shuf_r1 = shuf_b0 + k6;  // ..7..6..
5496   const auto shuf_g1 = shuf_r0 + k5;  // .7..6..5
5497   const auto shuf_b1 = shuf_g0 + k5;  // 7..6..5.
5498   const auto r1 = TableLookupBytes(full_a, shuf_r1);
5499   const auto g1 = TableLookupBytes(full_b, shuf_g1);
5500   const auto b1 = TableLookupBytes(full_c, shuf_b1);
5501   const decltype(Zero(d)) int1{(r1 | g1 | b1).raw};
5502   StoreU(int1, d, unaligned + 1 * 16);
5503 }
5504 
5505 // <= 32 bits
5506 template <size_t N, HWY_IF_LE32(uint8_t, N)>
5507 HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> v0,
5508                                const Vec128<uint8_t, N> v1,
5509                                const Vec128<uint8_t, N> v2,
5510                                Simd<uint8_t, N> /*tag*/,
5511                                uint8_t* HWY_RESTRICT unaligned) {
5512   // Use full vectors for the shuffles and result.
5513   const Full128<uint8_t> d_full;
5514 
5515   const Vec128<uint8_t> full_a{v0.raw};
5516   const Vec128<uint8_t> full_b{v1.raw};
5517   const Vec128<uint8_t> full_c{v2.raw};
5518 
5519   // Shuffle (v0,v1,v2) vector bytes to bgr[3:0].
5520   // 0x80 so lanes to be filled from other vectors are 0 for blending.
5521   alignas(16) static constexpr uint8_t tbl_r0[16] = {
5522       0,    0x80, 0x80, 1,   0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80,  //
5523       0x80, 0x80, 0x80, 0x80};
5524   const auto shuf_r0 = Load(d_full, tbl_r0);
5525   const auto shuf_g0 = CombineShiftRightBytes<15>(d_full, shuf_r0, shuf_r0);
5526   const auto shuf_b0 = CombineShiftRightBytes<14>(d_full, shuf_r0, shuf_r0);
5527   const auto r0 = TableLookupBytes(full_a, shuf_r0);  // ......3..2..1..0
5528   const auto g0 = TableLookupBytes(full_b, shuf_g0);  // .....3..2..1..0.
5529   const auto b0 = TableLookupBytes(full_c, shuf_b0);  // ....3..2..1..0..
5530   const auto int0 = r0 | g0 | b0;
5531   alignas(16) uint8_t buf[16];
5532   StoreU(int0, d_full, buf);
5533   CopyBytes<N * 3>(buf, unaligned);
5534 }
5535 
5536 // ------------------------------ StoreInterleaved4
5537 
5538 // 128 bits
5539 HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
5540                                const Vec128<uint8_t> v1,
5541                                const Vec128<uint8_t> v2,
5542                                const Vec128<uint8_t> v3, Full128<uint8_t> d8,
5543                                uint8_t* HWY_RESTRICT unaligned) {
5544   const RepartitionToWide<decltype(d8)> d16;
5545   const RepartitionToWide<decltype(d16)> d32;
5546   // let a,b,c,d denote v0..3.
5547   const auto ba0 = ZipLower(d16, v0, v1);  // b7 a7 .. b0 a0
5548   const auto dc0 = ZipLower(d16, v2, v3);  // d7 c7 .. d0 c0
5549   const auto ba8 = ZipUpper(d16, v0, v1);
5550   const auto dc8 = ZipUpper(d16, v2, v3);
5551   const auto dcba_0 = ZipLower(d32, ba0, dc0);  // d..a3 d..a0
5552   const auto dcba_4 = ZipUpper(d32, ba0, dc0);  // d..a7 d..a4
5553   const auto dcba_8 = ZipLower(d32, ba8, dc8);  // d..aB d..a8
5554   const auto dcba_C = ZipUpper(d32, ba8, dc8);  // d..aF d..aC
5555   StoreU(BitCast(d8, dcba_0), d8, unaligned + 0 * 16);
5556   StoreU(BitCast(d8, dcba_4), d8, unaligned + 1 * 16);
5557   StoreU(BitCast(d8, dcba_8), d8, unaligned + 2 * 16);
5558   StoreU(BitCast(d8, dcba_C), d8, unaligned + 3 * 16);
5559 }
5560 
5561 // 64 bits
5562 HWY_API void StoreInterleaved4(const Vec128<uint8_t, 8> in0,
5563                                const Vec128<uint8_t, 8> in1,
5564                                const Vec128<uint8_t, 8> in2,
5565                                const Vec128<uint8_t, 8> in3,
5566                                Simd<uint8_t, 8> /*tag*/,
5567                                uint8_t* HWY_RESTRICT unaligned) {
5568   // Use full vectors to reduce the number of stores.
5569   const Full128<uint8_t> d_full8;
5570   const RepartitionToWide<decltype(d_full8)> d16;
5571   const RepartitionToWide<decltype(d16)> d32;
5572   const Vec128<uint8_t> v0{in0.raw};
5573   const Vec128<uint8_t> v1{in1.raw};
5574   const Vec128<uint8_t> v2{in2.raw};
5575   const Vec128<uint8_t> v3{in3.raw};
5576   // let a,b,c,d denote v0..3.
5577   const auto ba0 = ZipLower(d16, v0, v1);       // b7 a7 .. b0 a0
5578   const auto dc0 = ZipLower(d16, v2, v3);       // d7 c7 .. d0 c0
5579   const auto dcba_0 = ZipLower(d32, ba0, dc0);  // d..a3 d..a0
5580   const auto dcba_4 = ZipUpper(d32, ba0, dc0);  // d..a7 d..a4
5581   StoreU(BitCast(d_full8, dcba_0), d_full8, unaligned + 0 * 16);
5582   StoreU(BitCast(d_full8, dcba_4), d_full8, unaligned + 1 * 16);
5583 }
5584 
5585 // <= 32 bits
5586 template <size_t N, HWY_IF_LE32(uint8_t, N)>
5587 HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> in0,
5588                                const Vec128<uint8_t, N> in1,
5589                                const Vec128<uint8_t, N> in2,
5590                                const Vec128<uint8_t, N> in3,
5591                                Simd<uint8_t, N> /*tag*/,
5592                                uint8_t* HWY_RESTRICT unaligned) {
5593   // Use full vectors to reduce the number of stores.
5594   const Full128<uint8_t> d_full8;
5595   const RepartitionToWide<decltype(d_full8)> d16;
5596   const RepartitionToWide<decltype(d16)> d32;
5597   const Vec128<uint8_t> v0{in0.raw};
5598   const Vec128<uint8_t> v1{in1.raw};
5599   const Vec128<uint8_t> v2{in2.raw};
5600   const Vec128<uint8_t> v3{in3.raw};
5601   // let a,b,c,d denote v0..3.
5602   const auto ba0 = ZipLower(d16, v0, v1);       // b3 a3 .. b0 a0
5603   const auto dc0 = ZipLower(d16, v2, v3);       // d3 c3 .. d0 c0
5604   const auto dcba_0 = ZipLower(d32, ba0, dc0);  // d..a3 d..a0
5605   alignas(16) uint8_t buf[16];
5606   StoreU(BitCast(d_full8, dcba_0), d_full8, buf);
5607   CopyBytes<4 * N>(buf, unaligned);
5608 }
5609 
5610 // ------------------------------ Reductions
5611 
5612 namespace detail {
5613 
5614 // N=1 for any T: no-op
5615 template <typename T>
5616 HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
5617                                    const Vec128<T, 1> v) {
5618   return v;
5619 }
5620 template <typename T>
5621 HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
5622                                    const Vec128<T, 1> v) {
5623   return v;
5624 }
5625 template <typename T>
5626 HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
5627                                    const Vec128<T, 1> v) {
5628   return v;
5629 }
5630 
5631 // u32/i32/f32:
5632 
5633 // N=2
5634 template <typename T>
5635 HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
5636                                    const Vec128<T, 2> v10) {
5637   return v10 + Shuffle2301(v10);
5638 }
5639 template <typename T>
5640 HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
5641                                    const Vec128<T, 2> v10) {
5642   return Min(v10, Shuffle2301(v10));
5643 }
5644 template <typename T>
5645 HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
5646                                    const Vec128<T, 2> v10) {
5647   return Max(v10, Shuffle2301(v10));
5648 }
5649 
5650 // N=4 (full)
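// Two shuffle+op steps: first combine each lane with the one two positions
// away (Shuffle1032), then combine the two partial results (Shuffle0321), so
// every output lane holds the full reduction.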
5651 template <typename T>
5652 HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
5653                                 const Vec128<T> v3210) {
5654   const Vec128<T> v1032 = Shuffle1032(v3210);
5655   const Vec128<T> v31_20_31_20 = v3210 + v1032;
5656   const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
5657   return v20_31_20_31 + v31_20_31_20;
5658 }
5659 template <typename T>
5660 HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
5661                                 const Vec128<T> v3210) {
5662   const Vec128<T> v1032 = Shuffle1032(v3210);
5663   const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
5664   const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
5665   return Min(v20_31_20_31, v31_20_31_20);
5666 }
5667 template <typename T>
5668 HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
5669                                 const Vec128<T> v3210) {
5670   const Vec128<T> v1032 = Shuffle1032(v3210);
5671   const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
5672   const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
5673   return Max(v20_31_20_31, v31_20_31_20);
5674 }
5675 
5676 // u64/i64/f64:
5677 
5678 // N=2 (full)
5679 template <typename T>
5680 HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
5681                                 const Vec128<T> v10) {
5682   const Vec128<T> v01 = Shuffle01(v10);
5683   return v10 + v01;
5684 }
5685 template <typename T>
5686 HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
5687                                 const Vec128<T> v10) {
5688   const Vec128<T> v01 = Shuffle01(v10);
5689   return Min(v10, v01);
5690 }
5691 template <typename T>
5692 HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
5693                                 const Vec128<T> v10) {
5694   const Vec128<T> v01 = Shuffle01(v10);
5695   return Max(v10, v01);
5696 }
5697 
5698 // u16/i16
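// Reuse the 32-bit reductions: place the even and odd 16-bit lanes in separate
// 32-bit lanes, reduce them together, then broadcast the 16-bit result back
// into both halves of each 32-bit lane.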
5699 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
5700 HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
5701   const Repartition<int32_t, Simd<T, N>> d32;
5702   const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
5703   const auto odd = ShiftRight<16>(BitCast(d32, v));
5704   const auto min = MinOfLanes(d32, Min(even, odd));
5705   // Also broadcast into odd lanes.
5706   return BitCast(Simd<T, N>(), Or(min, ShiftLeft<16>(min)));
5707 }
5708 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
5709 HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
5710   const Repartition<int32_t, Simd<T, N>> d32;
5711   const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
5712   const auto odd = ShiftRight<16>(BitCast(d32, v));
  const auto max = MaxOfLanes(d32, Max(even, odd));
  // Also broadcast into odd lanes.
  return BitCast(Simd<T, N>(), Or(max, ShiftLeft<16>(max)));
5716 }
5717 
5718 }  // namespace detail
5719 
// Supported for u/i/f 32/64 and, for Min/MaxOfLanes, 16-bit lanes. Returns the
// same value in each lane.
5721 template <typename T, size_t N>
5722 HWY_API Vec128<T, N> SumOfLanes(Simd<T, N> /* tag */, const Vec128<T, N> v) {
5723   return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
5724 }
5725 template <typename T, size_t N>
5726 HWY_API Vec128<T, N> MinOfLanes(Simd<T, N> /* tag */, const Vec128<T, N> v) {
5727   return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
5728 }
5729 template <typename T, size_t N>
5730 HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N> /* tag */, const Vec128<T, N> v) {
5731   return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
5732 }
5733 
5734 // ================================================== DEPRECATED
5735 
5736 template <typename T, size_t N>
5737 HWY_API size_t StoreMaskBits(const Mask128<T, N> mask, uint8_t* bits) {
5738   return StoreMaskBits(Simd<T, N>(), mask, bits);
5739 }
5740 
5741 template <typename T, size_t N>
5742 HWY_API bool AllTrue(const Mask128<T, N> mask) {
5743   return AllTrue(Simd<T, N>(), mask);
5744 }
5745 
5746 template <typename T, size_t N>
5747 HWY_API bool AllFalse(const Mask128<T, N> mask) {
5748   return AllFalse(Simd<T, N>(), mask);
5749 }
5750 
5751 template <typename T, size_t N>
5752 HWY_API size_t CountTrue(const Mask128<T, N> mask) {
5753   return CountTrue(Simd<T, N>(), mask);
5754 }
5755 
5756 template <typename T, size_t N>
5757 HWY_API Vec128<T, N> SumOfLanes(const Vec128<T, N> v) {
5758   return SumOfLanes(Simd<T, N>(), v);
5759 }
5760 template <typename T, size_t N>
5761 HWY_API Vec128<T, N> MinOfLanes(const Vec128<T, N> v) {
5762   return MinOfLanes(Simd<T, N>(), v);
5763 }
5764 template <typename T, size_t N>
5765 HWY_API Vec128<T, N> MaxOfLanes(const Vec128<T, N> v) {
5766   return MaxOfLanes(Simd<T, N>(), v);
5767 }
5768 
5769 template <typename T, size_t N>
5770 HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Vec128<T, N> v) {
5771   return UpperHalf(Half<Simd<T, N>>(), v);
5772 }
5773 
5774 template <int kBytes, typename T, size_t N>
5775 HWY_API Vec128<T, N> ShiftRightBytes(const Vec128<T, N> v) {
5776   return ShiftRightBytes<kBytes>(Simd<T, N>(), v);
5777 }
5778 
5779 template <int kLanes, typename T, size_t N>
5780 HWY_API Vec128<T, N> ShiftRightLanes(const Vec128<T, N> v) {
5781   return ShiftRightLanes<kLanes>(Simd<T, N>(), v);
5782 }
5783 
5784 template <size_t kBytes, typename T, size_t N>
5785 HWY_API Vec128<T, N> CombineShiftRightBytes(Vec128<T, N> hi, Vec128<T, N> lo) {
5786   return CombineShiftRightBytes<kBytes>(Simd<T, N>(), hi, lo);
5787 }
5788 
5789 template <typename T, size_t N>
5790 HWY_API Vec128<T, N> InterleaveUpper(Vec128<T, N> a, Vec128<T, N> b) {
5791   return InterleaveUpper(Simd<T, N>(), a, b);
5792 }
5793 
5794 template <typename T, size_t N, class D = Simd<T, N>>
5795 HWY_API VFromD<RepartitionToWide<D>> ZipUpper(Vec128<T, N> a, Vec128<T, N> b) {
5796   return InterleaveUpper(RepartitionToWide<D>(), a, b);
5797 }
5798 
5799 template <typename T, size_t N2>
5800 HWY_API Vec128<T, N2 * 2> Combine(Vec128<T, N2> hi2, Vec128<T, N2> lo2) {
5801   return Combine(Simd<T, N2 * 2>(), hi2, lo2);
5802 }
5803 
5804 template <typename T, size_t N2, HWY_IF_LE64(T, N2)>
5805 HWY_API Vec128<T, N2 * 2> ZeroExtendVector(Vec128<T, N2> lo) {
5806   return ZeroExtendVector(Simd<T, N2 * 2>(), lo);
5807 }
5808 
5809 template <typename T, size_t N>
5810 HWY_API Vec128<T, N> ConcatLowerLower(Vec128<T, N> hi, Vec128<T, N> lo) {
5811   return ConcatLowerLower(Simd<T, N>(), hi, lo);
5812 }
5813 
5814 template <typename T, size_t N>
5815 HWY_API Vec128<T, N> ConcatUpperUpper(Vec128<T, N> hi, Vec128<T, N> lo) {
5816   return ConcatUpperUpper(Simd<T, N>(), hi, lo);
5817 }
5818 
5819 template <typename T, size_t N>
5820 HWY_API Vec128<T, N> ConcatLowerUpper(const Vec128<T, N> hi,
5821                                       const Vec128<T, N> lo) {
5822   return ConcatLowerUpper(Simd<T, N>(), hi, lo);
5823 }
5824 
5825 template <typename T, size_t N>
5826 HWY_API Vec128<T, N> ConcatUpperLower(Vec128<T, N> hi, Vec128<T, N> lo) {
5827   return ConcatUpperLower(Simd<T, N>(), hi, lo);
5828 }
5829 
5830 // ================================================== Operator wrapper
5831 
5832 // These apply to all x86_*-inl.h because there are no restrictions on V.
5833 
5834 template <class V>
5835 HWY_API V Add(V a, V b) {
5836   return a + b;
5837 }
5838 template <class V>
5839 HWY_API V Sub(V a, V b) {
5840   return a - b;
5841 }
5842 
5843 template <class V>
5844 HWY_API V Mul(V a, V b) {
5845   return a * b;
5846 }
5847 template <class V>
5848 HWY_API V Div(V a, V b) {
5849   return a / b;
5850 }
5851 
5852 template <class V>
HWY_API V Shl(V a, V b) {
5854   return a << b;
5855 }
5856 template <class V>
HWY_API V Shr(V a, V b) {
5858   return a >> b;
5859 }
5860 
5861 template <class V>
5862 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
5863   return a == b;
5864 }
5865 template <class V>
5866 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
5867   return a != b;
5868 }
5869 template <class V>
5870 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
5871   return a < b;
5872 }
5873 
5874 template <class V>
5875 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
5876   return a > b;
5877 }
5878 template <class V>
5879 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
5880   return a >= b;
5881 }
5882 
5883 template <class V>
5884 HWY_API auto Le(V a, V b) -> decltype(a == b) {
5885   return a <= b;
5886 }
5887 
5888 // NOLINTNEXTLINE(google-readability-namespace-comments)
5889 }  // namespace HWY_NAMESPACE
5890 }  // namespace hwy
5891 HWY_AFTER_NAMESPACE();
5892