1 // Copyright 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // Single-element vectors and operations.
16 // External include guard in highway.h - see comment there.
17 
18 #include <stddef.h>
19 #include <stdint.h>
20 
21 #include "hwy/base.h"
22 #include "hwy/ops/shared-inl.h"
23 
24 HWY_BEFORE_NAMESPACE();
25 namespace hwy {
26 namespace HWY_NAMESPACE {
27 
// Single instruction, single data: the tag type for lane type T. It is passed
// (usually unnamed, as `/* tag */`) to the functions below so that overloads
// can be selected by lane type.
template <typename T>
using Sisd = Simd<T, 1, 0>;
31 
32 // (Wrapper class required for overloading comparison operators.)
33 template <typename T>
34 struct Vec1 {
35   HWY_INLINE Vec1() = default;
36   Vec1(const Vec1&) = default;
37   Vec1& operator=(const Vec1&) = default;
Vec1Vec138   HWY_INLINE explicit Vec1(const T t) : raw(t) {}
39 
40   HWY_INLINE Vec1& operator*=(const Vec1 other) {
41     return *this = (*this * other);
42   }
43   HWY_INLINE Vec1& operator/=(const Vec1 other) {
44     return *this = (*this / other);
45   }
46   HWY_INLINE Vec1& operator+=(const Vec1 other) {
47     return *this = (*this + other);
48   }
49   HWY_INLINE Vec1& operator-=(const Vec1 other) {
50     return *this = (*this - other);
51   }
52   HWY_INLINE Vec1& operator&=(const Vec1 other) {
53     return *this = (*this & other);
54   }
55   HWY_INLINE Vec1& operator|=(const Vec1 other) {
56     return *this = (*this | other);
57   }
58   HWY_INLINE Vec1& operator^=(const Vec1 other) {
59     return *this = (*this ^ other);
60   }
61 
62   T raw;
63 };
64 
65 // 0 or FF..FF, same size as Vec1.
66 template <typename T>
67 class Mask1 {
68   using Raw = hwy::MakeUnsigned<T>;
69 
70  public:
FromBool(bool b)71   static HWY_INLINE Mask1<T> FromBool(bool b) {
72     Mask1<T> mask;
73     mask.bits = b ? ~Raw(0) : 0;
74     return mask;
75   }
76 
77   Raw bits;
78 };
79 
80 namespace detail {
81 
82 // Deduce Sisd<T> from Vec1<T>
83 struct Deduce1 {
84   template <typename T>
operatorDeduce185   Sisd<T> operator()(Vec1<T>) const {
86     return Sisd<T>();
87   }
88 };
89 
90 }  // namespace detail
91 
// Tag type (Sisd<T>) corresponding to the vector type V, obtained from the
// return type of detail::Deduce1 applied to a default-constructed V.
template <class V>
using DFromV = decltype(detail::Deduce1()(V()));
94 
// Lane type of the vector type V: TFromD applied to V's tag type DFromV<V>.
template <class V>
using TFromV = TFromD<DFromV<V>>;
97 
98 // ------------------------------ BitCast
99 
100 template <typename T, typename FromT>
BitCast(Sisd<T>,Vec1<FromT> v)101 HWY_API Vec1<T> BitCast(Sisd<T> /* tag */, Vec1<FromT> v) {
102   static_assert(sizeof(T) <= sizeof(FromT), "Promoting is undefined");
103   T to;
104   CopyBytes<sizeof(FromT)>(&v.raw, &to);
105   return Vec1<T>(to);
106 }
107 
108 // ------------------------------ Set
109 
110 template <typename T>
Zero(Sisd<T>)111 HWY_API Vec1<T> Zero(Sisd<T> /* tag */) {
112   return Vec1<T>(T(0));
113 }
114 
115 template <typename T, typename T2>
Set(Sisd<T>,const T2 t)116 HWY_API Vec1<T> Set(Sisd<T> /* tag */, const T2 t) {
117   return Vec1<T>(static_cast<T>(t));
118 }
119 
120 template <typename T>
Undefined(Sisd<T> d)121 HWY_API Vec1<T> Undefined(Sisd<T> d) {
122   return Zero(d);
123 }
124 
125 template <typename T, typename T2>
Iota(const Sisd<T>,const T2 first)126 HWY_API Vec1<T> Iota(const Sisd<T> /* tag */, const T2 first) {
127   return Vec1<T>(static_cast<T>(first));
128 }
129 
130 // ================================================== LOGICAL
131 
132 // ------------------------------ Not
133 
134 template <typename T>
Not(const Vec1<T> v)135 HWY_API Vec1<T> Not(const Vec1<T> v) {
136   using TU = MakeUnsigned<T>;
137   const Sisd<TU> du;
138   return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, v).raw)));
139 }
140 
141 // ------------------------------ And
142 
143 template <typename T>
And(const Vec1<T> a,const Vec1<T> b)144 HWY_API Vec1<T> And(const Vec1<T> a, const Vec1<T> b) {
145   using TU = MakeUnsigned<T>;
146   const Sisd<TU> du;
147   return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw & BitCast(du, b).raw));
148 }
149 template <typename T>
150 HWY_API Vec1<T> operator&(const Vec1<T> a, const Vec1<T> b) {
151   return And(a, b);
152 }
153 
154 // ------------------------------ AndNot
155 
156 template <typename T>
AndNot(const Vec1<T> a,const Vec1<T> b)157 HWY_API Vec1<T> AndNot(const Vec1<T> a, const Vec1<T> b) {
158   using TU = MakeUnsigned<T>;
159   const Sisd<TU> du;
160   return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, a).raw &
161                                                      BitCast(du, b).raw)));
162 }
163 
164 // ------------------------------ Or
165 
166 template <typename T>
Or(const Vec1<T> a,const Vec1<T> b)167 HWY_API Vec1<T> Or(const Vec1<T> a, const Vec1<T> b) {
168   using TU = MakeUnsigned<T>;
169   const Sisd<TU> du;
170   return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw | BitCast(du, b).raw));
171 }
172 template <typename T>
173 HWY_API Vec1<T> operator|(const Vec1<T> a, const Vec1<T> b) {
174   return Or(a, b);
175 }
176 
177 // ------------------------------ Xor
178 
179 template <typename T>
Xor(const Vec1<T> a,const Vec1<T> b)180 HWY_API Vec1<T> Xor(const Vec1<T> a, const Vec1<T> b) {
181   using TU = MakeUnsigned<T>;
182   const Sisd<TU> du;
183   return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw ^ BitCast(du, b).raw));
184 }
185 template <typename T>
186 HWY_API Vec1<T> operator^(const Vec1<T> a, const Vec1<T> b) {
187   return Xor(a, b);
188 }
189 
190 // ------------------------------ OrAnd
191 
192 template <typename T>
OrAnd(const Vec1<T> o,const Vec1<T> a1,const Vec1<T> a2)193 HWY_API Vec1<T> OrAnd(const Vec1<T> o, const Vec1<T> a1, const Vec1<T> a2) {
194   return Or(o, And(a1, a2));
195 }
196 
197 // ------------------------------ IfVecThenElse
198 
199 template <typename T>
IfVecThenElse(Vec1<T> mask,Vec1<T> yes,Vec1<T> no)200 HWY_API Vec1<T> IfVecThenElse(Vec1<T> mask, Vec1<T> yes, Vec1<T> no) {
201   return IfThenElse(MaskFromVec(mask), yes, no);
202 }
203 
204 // ------------------------------ CopySign
205 
206 template <typename T>
CopySign(const Vec1<T> magn,const Vec1<T> sign)207 HWY_API Vec1<T> CopySign(const Vec1<T> magn, const Vec1<T> sign) {
208   static_assert(IsFloat<T>(), "Only makes sense for floating-point");
209   const auto msb = SignBit(Sisd<T>());
210   return Or(AndNot(msb, magn), And(msb, sign));
211 }
212 
213 template <typename T>
CopySignToAbs(const Vec1<T> abs,const Vec1<T> sign)214 HWY_API Vec1<T> CopySignToAbs(const Vec1<T> abs, const Vec1<T> sign) {
215   static_assert(IsFloat<T>(), "Only makes sense for floating-point");
216   return Or(abs, And(SignBit(Sisd<T>()), sign));
217 }
218 
219 // ------------------------------ BroadcastSignBit
220 
221 template <typename T>
BroadcastSignBit(const Vec1<T> v)222 HWY_API Vec1<T> BroadcastSignBit(const Vec1<T> v) {
223   // This is used inside ShiftRight, so we cannot implement in terms of it.
224   return v.raw < 0 ? Vec1<T>(T(-1)) : Vec1<T>(0);
225 }
226 
227 // ------------------------------ PopulationCount
228 
// Toggles HWY_NATIVE_POPCNT: undefined if it was defined, defined otherwise.
// NOTE(review): presumably code elsewhere inspects this toggle state to learn
// that this target provides its own PopulationCount - confirm.
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif
234 
235 template <typename T>
PopulationCount(Vec1<T> v)236 HWY_API Vec1<T> PopulationCount(Vec1<T> v) {
237   return Vec1<T>(static_cast<T>(PopCount(v.raw)));
238 }
239 
240 // ------------------------------ Mask
241 
242 template <typename TFrom, typename TTo>
RebindMask(Sisd<TTo>,Mask1<TFrom> m)243 HWY_API Mask1<TTo> RebindMask(Sisd<TTo> /*tag*/, Mask1<TFrom> m) {
244   static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
245   return Mask1<TTo>{m.bits};
246 }
247 
248 // v must be 0 or FF..FF.
249 template <typename T>
MaskFromVec(const Vec1<T> v)250 HWY_API Mask1<T> MaskFromVec(const Vec1<T> v) {
251   Mask1<T> mask;
252   CopyBytes<sizeof(mask.bits)>(&v.raw, &mask.bits);
253   return mask;
254 }
255 
256 template <typename T>
VecFromMask(const Mask1<T> mask)257 Vec1<T> VecFromMask(const Mask1<T> mask) {
258   Vec1<T> v;
259   CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
260   return v;
261 }
262 
263 template <typename T>
VecFromMask(Sisd<T>,const Mask1<T> mask)264 Vec1<T> VecFromMask(Sisd<T> /* tag */, const Mask1<T> mask) {
265   Vec1<T> v;
266   CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
267   return v;
268 }
269 
270 template <typename T>
FirstN(Sisd<T>,size_t n)271 HWY_API Mask1<T> FirstN(Sisd<T> /*tag*/, size_t n) {
272   return Mask1<T>::FromBool(n != 0);
273 }
274 
275 // Returns mask ? yes : no.
276 template <typename T>
IfThenElse(const Mask1<T> mask,const Vec1<T> yes,const Vec1<T> no)277 HWY_API Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes,
278                            const Vec1<T> no) {
279   return mask.bits ? yes : no;
280 }
281 
282 template <typename T>
IfThenElseZero(const Mask1<T> mask,const Vec1<T> yes)283 HWY_API Vec1<T> IfThenElseZero(const Mask1<T> mask, const Vec1<T> yes) {
284   return mask.bits ? yes : Vec1<T>(0);
285 }
286 
287 template <typename T>
IfThenZeroElse(const Mask1<T> mask,const Vec1<T> no)288 HWY_API Vec1<T> IfThenZeroElse(const Mask1<T> mask, const Vec1<T> no) {
289   return mask.bits ? Vec1<T>(0) : no;
290 }
291 
292 template <typename T>
IfNegativeThenElse(Vec1<T> v,Vec1<T> yes,Vec1<T> no)293 HWY_API Vec1<T> IfNegativeThenElse(Vec1<T> v, Vec1<T> yes, Vec1<T> no) {
294   return v.raw < 0 ? yes : no;
295 }
296 
297 template <typename T>
ZeroIfNegative(const Vec1<T> v)298 HWY_API Vec1<T> ZeroIfNegative(const Vec1<T> v) {
299   return v.raw < 0 ? Vec1<T>(0) : v;
300 }
301 
302 // ------------------------------ Mask logical
303 
304 template <typename T>
Not(const Mask1<T> m)305 HWY_API Mask1<T> Not(const Mask1<T> m) {
306   return MaskFromVec(Not(VecFromMask(Sisd<T>(), m)));
307 }
308 
309 template <typename T>
And(const Mask1<T> a,Mask1<T> b)310 HWY_API Mask1<T> And(const Mask1<T> a, Mask1<T> b) {
311   const Sisd<T> d;
312   return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
313 }
314 
315 template <typename T>
AndNot(const Mask1<T> a,Mask1<T> b)316 HWY_API Mask1<T> AndNot(const Mask1<T> a, Mask1<T> b) {
317   const Sisd<T> d;
318   return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
319 }
320 
321 template <typename T>
Or(const Mask1<T> a,Mask1<T> b)322 HWY_API Mask1<T> Or(const Mask1<T> a, Mask1<T> b) {
323   const Sisd<T> d;
324   return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
325 }
326 
327 template <typename T>
Xor(const Mask1<T> a,Mask1<T> b)328 HWY_API Mask1<T> Xor(const Mask1<T> a, Mask1<T> b) {
329   const Sisd<T> d;
330   return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
331 }
332 
333 // ================================================== SHIFTS
334 
335 // ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
336 
337 template <int kBits, typename T>
ShiftLeft(const Vec1<T> v)338 HWY_API Vec1<T> ShiftLeft(const Vec1<T> v) {
339   static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
340   return Vec1<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits);
341 }
342 
343 template <int kBits, typename T>
ShiftRight(const Vec1<T> v)344 HWY_API Vec1<T> ShiftRight(const Vec1<T> v) {
345   static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
346 #if __cplusplus >= 202002L
347   // Signed right shift is now guaranteed to be arithmetic (rounding toward
348   // negative infinity, i.e. shifting in the sign bit).
349   return Vec1<T>(v.raw >> kBits);
350 #else
351   if (IsSigned<T>()) {
352     // Emulate arithmetic shift using only logical (unsigned) shifts, because
353     // signed shifts are still implementation-defined.
354     using TU = hwy::MakeUnsigned<T>;
355     const Sisd<TU> du;
356     const TU shifted = BitCast(du, v).raw >> kBits;
357     const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
358     const TU upper = sign << (sizeof(TU) * 8 - 1 - kBits);
359     return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
360   } else {
361     return Vec1<T>(v.raw >> kBits);  // unsigned, logical shift
362   }
363 #endif
364 }
365 
366 // ------------------------------ RotateRight (ShiftRight)
367 
368 template <int kBits, typename T>
RotateRight(const Vec1<T> v)369 HWY_API Vec1<T> RotateRight(const Vec1<T> v) {
370   static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
371   if (kBits == 0) return v;
372   return Or(ShiftRight<kBits>(v), ShiftLeft<sizeof(T) * 8 - kBits>(v));
373 }
374 
375 // ------------------------------ ShiftLeftSame (BroadcastSignBit)
376 
377 template <typename T>
ShiftLeftSame(const Vec1<T> v,int bits)378 HWY_API Vec1<T> ShiftLeftSame(const Vec1<T> v, int bits) {
379   return Vec1<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits);
380 }
381 
382 template <typename T>
ShiftRightSame(const Vec1<T> v,int bits)383 HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
384 #if __cplusplus >= 202002L
385   // Signed right shift is now guaranteed to be arithmetic (rounding toward
386   // negative infinity, i.e. shifting in the sign bit).
387   return Vec1<T>(v.raw >> bits);
388 #else
389   if (IsSigned<T>()) {
390     // Emulate arithmetic shift using only logical (unsigned) shifts, because
391     // signed shifts are still implementation-defined.
392     using TU = hwy::MakeUnsigned<T>;
393     const Sisd<TU> du;
394     const TU shifted = BitCast(du, v).raw >> bits;
395     const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
396     const TU upper = sign << (sizeof(TU) * 8 - 1 - bits);
397     return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
398   } else {
399     return Vec1<T>(v.raw >> bits);  // unsigned, logical shift
400   }
401 #endif
402 }
403 
404 // ------------------------------ Shl
405 
406 // Single-lane => same as ShiftLeftSame except for the argument type.
407 template <typename T>
408 HWY_API Vec1<T> operator<<(const Vec1<T> v, const Vec1<T> bits) {
409   return ShiftLeftSame(v, static_cast<int>(bits.raw));
410 }
411 
412 template <typename T>
413 HWY_API Vec1<T> operator>>(const Vec1<T> v, const Vec1<T> bits) {
414   return ShiftRightSame(v, static_cast<int>(bits.raw));
415 }
416 
417 // ================================================== ARITHMETIC
418 
419 template <typename T>
420 HWY_API Vec1<T> operator+(Vec1<T> a, Vec1<T> b) {
421   const uint64_t a64 = static_cast<uint64_t>(a.raw);
422   const uint64_t b64 = static_cast<uint64_t>(b.raw);
423   return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))));
424 }
425 HWY_API Vec1<float> operator+(const Vec1<float> a, const Vec1<float> b) {
426   return Vec1<float>(a.raw + b.raw);
427 }
428 HWY_API Vec1<double> operator+(const Vec1<double> a, const Vec1<double> b) {
429   return Vec1<double>(a.raw + b.raw);
430 }
431 
432 template <typename T>
433 HWY_API Vec1<T> operator-(Vec1<T> a, Vec1<T> b) {
434   const uint64_t a64 = static_cast<uint64_t>(a.raw);
435   const uint64_t b64 = static_cast<uint64_t>(b.raw);
436   return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))));
437 }
438 HWY_API Vec1<float> operator-(const Vec1<float> a, const Vec1<float> b) {
439   return Vec1<float>(a.raw - b.raw);
440 }
441 HWY_API Vec1<double> operator-(const Vec1<double> a, const Vec1<double> b) {
442   return Vec1<double>(a.raw - b.raw);
443 }
444 
445 // ------------------------------ SumsOf8
446 
SumsOf8(const Vec1<uint8_t> v)447 HWY_API Vec1<uint64_t> SumsOf8(const Vec1<uint8_t> v) {
448   return Vec1<uint64_t>(v.raw);
449 }
450 
451 // ------------------------------ SaturatedAdd
452 
453 // Returns a + b clamped to the destination range.
454 
455 // Unsigned
SaturatedAdd(const Vec1<uint8_t> a,const Vec1<uint8_t> b)456 HWY_API Vec1<uint8_t> SaturatedAdd(const Vec1<uint8_t> a,
457                                    const Vec1<uint8_t> b) {
458   return Vec1<uint8_t>(
459       static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255)));
460 }
SaturatedAdd(const Vec1<uint16_t> a,const Vec1<uint16_t> b)461 HWY_API Vec1<uint16_t> SaturatedAdd(const Vec1<uint16_t> a,
462                                     const Vec1<uint16_t> b) {
463   return Vec1<uint16_t>(
464       static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535)));
465 }
466 
467 // Signed
SaturatedAdd(const Vec1<int8_t> a,const Vec1<int8_t> b)468 HWY_API Vec1<int8_t> SaturatedAdd(const Vec1<int8_t> a, const Vec1<int8_t> b) {
469   return Vec1<int8_t>(
470       static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127)));
471 }
SaturatedAdd(const Vec1<int16_t> a,const Vec1<int16_t> b)472 HWY_API Vec1<int16_t> SaturatedAdd(const Vec1<int16_t> a,
473                                    const Vec1<int16_t> b) {
474   return Vec1<int16_t>(
475       static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767)));
476 }
477 
478 // ------------------------------ Saturating subtraction
479 
480 // Returns a - b clamped to the destination range.
481 
482 // Unsigned
SaturatedSub(const Vec1<uint8_t> a,const Vec1<uint8_t> b)483 HWY_API Vec1<uint8_t> SaturatedSub(const Vec1<uint8_t> a,
484                                    const Vec1<uint8_t> b) {
485   return Vec1<uint8_t>(
486       static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255)));
487 }
SaturatedSub(const Vec1<uint16_t> a,const Vec1<uint16_t> b)488 HWY_API Vec1<uint16_t> SaturatedSub(const Vec1<uint16_t> a,
489                                     const Vec1<uint16_t> b) {
490   return Vec1<uint16_t>(
491       static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535)));
492 }
493 
494 // Signed
SaturatedSub(const Vec1<int8_t> a,const Vec1<int8_t> b)495 HWY_API Vec1<int8_t> SaturatedSub(const Vec1<int8_t> a, const Vec1<int8_t> b) {
496   return Vec1<int8_t>(
497       static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127)));
498 }
SaturatedSub(const Vec1<int16_t> a,const Vec1<int16_t> b)499 HWY_API Vec1<int16_t> SaturatedSub(const Vec1<int16_t> a,
500                                    const Vec1<int16_t> b) {
501   return Vec1<int16_t>(
502       static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767)));
503 }
504 
505 // ------------------------------ Average
506 
507 // Returns (a + b + 1) / 2
508 
AverageRound(const Vec1<uint8_t> a,const Vec1<uint8_t> b)509 HWY_API Vec1<uint8_t> AverageRound(const Vec1<uint8_t> a,
510                                    const Vec1<uint8_t> b) {
511   return Vec1<uint8_t>(static_cast<uint8_t>((a.raw + b.raw + 1) / 2));
512 }
AverageRound(const Vec1<uint16_t> a,const Vec1<uint16_t> b)513 HWY_API Vec1<uint16_t> AverageRound(const Vec1<uint16_t> a,
514                                     const Vec1<uint16_t> b) {
515   return Vec1<uint16_t>(static_cast<uint16_t>((a.raw + b.raw + 1) / 2));
516 }
517 
518 // ------------------------------ Absolute value
519 
520 template <typename T>
Abs(const Vec1<T> a)521 HWY_API Vec1<T> Abs(const Vec1<T> a) {
522   const T i = a.raw;
523   return (i >= 0 || i == hwy::LimitsMin<T>()) ? a : Vec1<T>(-i);
524 }
Abs(const Vec1<float> a)525 HWY_API Vec1<float> Abs(const Vec1<float> a) {
526   return Vec1<float>(std::abs(a.raw));
527 }
Abs(const Vec1<double> a)528 HWY_API Vec1<double> Abs(const Vec1<double> a) {
529   return Vec1<double>(std::abs(a.raw));
530 }
531 
532 // ------------------------------ min/max
533 
534 template <typename T, HWY_IF_NOT_FLOAT(T)>
Min(const Vec1<T> a,const Vec1<T> b)535 HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
536   return Vec1<T>(HWY_MIN(a.raw, b.raw));
537 }
538 
539 template <typename T, HWY_IF_FLOAT(T)>
Min(const Vec1<T> a,const Vec1<T> b)540 HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
541   if (std::isnan(a.raw)) return b;
542   if (std::isnan(b.raw)) return a;
543   return Vec1<T>(HWY_MIN(a.raw, b.raw));
544 }
545 
546 template <typename T, HWY_IF_NOT_FLOAT(T)>
Max(const Vec1<T> a,const Vec1<T> b)547 HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
548   return Vec1<T>(HWY_MAX(a.raw, b.raw));
549 }
550 
551 template <typename T, HWY_IF_FLOAT(T)>
Max(const Vec1<T> a,const Vec1<T> b)552 HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
553   if (std::isnan(a.raw)) return b;
554   if (std::isnan(b.raw)) return a;
555   return Vec1<T>(HWY_MAX(a.raw, b.raw));
556 }
557 
558 // ------------------------------ Floating-point negate
559 
560 template <typename T, HWY_IF_FLOAT(T)>
Neg(const Vec1<T> v)561 HWY_API Vec1<T> Neg(const Vec1<T> v) {
562   return Xor(v, SignBit(Sisd<T>()));
563 }
564 
565 template <typename T, HWY_IF_NOT_FLOAT(T)>
Neg(const Vec1<T> v)566 HWY_API Vec1<T> Neg(const Vec1<T> v) {
567   return Zero(Sisd<T>()) - v;
568 }
569 
570 // ------------------------------ mul/div
571 
572 template <typename T, HWY_IF_FLOAT(T)>
573 HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
574   return Vec1<T>(static_cast<T>(double(a.raw) * b.raw));
575 }
576 
577 template <typename T, HWY_IF_SIGNED(T)>
578 HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
579   return Vec1<T>(static_cast<T>(int64_t(a.raw) * b.raw));
580 }
581 
582 template <typename T, HWY_IF_UNSIGNED(T)>
583 HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
584   return Vec1<T>(static_cast<T>(uint64_t(a.raw) * b.raw));
585 }
586 
587 template <typename T>
588 HWY_API Vec1<T> operator/(const Vec1<T> a, const Vec1<T> b) {
589   return Vec1<T>(a.raw / b.raw);
590 }
591 
592 // Returns the upper 16 bits of a * b in each lane.
MulHigh(const Vec1<int16_t> a,const Vec1<int16_t> b)593 HWY_API Vec1<int16_t> MulHigh(const Vec1<int16_t> a, const Vec1<int16_t> b) {
594   return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw) >> 16));
595 }
MulHigh(const Vec1<uint16_t> a,const Vec1<uint16_t> b)596 HWY_API Vec1<uint16_t> MulHigh(const Vec1<uint16_t> a, const Vec1<uint16_t> b) {
597   // Cast to uint32_t first to prevent overflow. Otherwise the result of
598   // uint16_t * uint16_t is in "int" which may overflow. In practice the result
599   // is the same but this way it is also defined.
600   return Vec1<uint16_t>(static_cast<uint16_t>(
601       (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
602 }
603 
604 // Multiplies even lanes (0, 2 ..) and returns the double-wide result.
MulEven(const Vec1<int32_t> a,const Vec1<int32_t> b)605 HWY_API Vec1<int64_t> MulEven(const Vec1<int32_t> a, const Vec1<int32_t> b) {
606   const int64_t a64 = a.raw;
607   return Vec1<int64_t>(a64 * b.raw);
608 }
MulEven(const Vec1<uint32_t> a,const Vec1<uint32_t> b)609 HWY_API Vec1<uint64_t> MulEven(const Vec1<uint32_t> a, const Vec1<uint32_t> b) {
610   const uint64_t a64 = a.raw;
611   return Vec1<uint64_t>(a64 * b.raw);
612 }
613 
614 // Approximate reciprocal
ApproximateReciprocal(const Vec1<float> v)615 HWY_API Vec1<float> ApproximateReciprocal(const Vec1<float> v) {
616   // Zero inputs are allowed, but callers are responsible for replacing the
617   // return value with something else (typically using IfThenElse). This check
618   // avoids a ubsan error. The return value is arbitrary.
619   if (v.raw == 0.0f) return Vec1<float>(0.0f);
620   return Vec1<float>(1.0f / v.raw);
621 }
622 
623 // Absolute value of difference.
AbsDiff(const Vec1<float> a,const Vec1<float> b)624 HWY_API Vec1<float> AbsDiff(const Vec1<float> a, const Vec1<float> b) {
625   return Abs(a - b);
626 }
627 
628 // ------------------------------ Floating-point multiply-add variants
629 
630 template <typename T>
MulAdd(const Vec1<T> mul,const Vec1<T> x,const Vec1<T> add)631 HWY_API Vec1<T> MulAdd(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> add) {
632   return mul * x + add;
633 }
634 
635 template <typename T>
NegMulAdd(const Vec1<T> mul,const Vec1<T> x,const Vec1<T> add)636 HWY_API Vec1<T> NegMulAdd(const Vec1<T> mul, const Vec1<T> x,
637                           const Vec1<T> add) {
638   return add - mul * x;
639 }
640 
641 template <typename T>
MulSub(const Vec1<T> mul,const Vec1<T> x,const Vec1<T> sub)642 HWY_API Vec1<T> MulSub(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> sub) {
643   return mul * x - sub;
644 }
645 
646 template <typename T>
NegMulSub(const Vec1<T> mul,const Vec1<T> x,const Vec1<T> sub)647 HWY_API Vec1<T> NegMulSub(const Vec1<T> mul, const Vec1<T> x,
648                           const Vec1<T> sub) {
649   return Neg(mul) * x - sub;
650 }
651 
652 // ------------------------------ Floating-point square root
653 
654 // Approximate reciprocal square root
ApproximateReciprocalSqrt(const Vec1<float> v)655 HWY_API Vec1<float> ApproximateReciprocalSqrt(const Vec1<float> v) {
656   float f = v.raw;
657   const float half = f * 0.5f;
658   uint32_t bits;
659   CopyBytes<4>(&f, &bits);
660   // Initial guess based on log2(f)
661   bits = 0x5F3759DF - (bits >> 1);
662   CopyBytes<4>(&bits, &f);
663   // One Newton-Raphson iteration
664   return Vec1<float>(f * (1.5f - (half * f * f)));
665 }
666 
667 // Square root
Sqrt(const Vec1<float> v)668 HWY_API Vec1<float> Sqrt(const Vec1<float> v) {
669   return Vec1<float>(std::sqrt(v.raw));
670 }
Sqrt(const Vec1<double> v)671 HWY_API Vec1<double> Sqrt(const Vec1<double> v) {
672   return Vec1<double>(std::sqrt(v.raw));
673 }
674 
675 // ------------------------------ Floating-point rounding
676 
677 template <typename T>
Round(const Vec1<T> v)678 HWY_API Vec1<T> Round(const Vec1<T> v) {
679   using TI = MakeSigned<T>;
680   if (!(Abs(v).raw < MantissaEnd<T>())) {  // Huge or NaN
681     return v;
682   }
683   const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
684   const TI rounded = static_cast<TI>(v.raw + bias);
685   if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
686   // Round to even
687   if ((rounded & 1) && std::abs(rounded - v.raw) == T(0.5)) {
688     return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
689   }
690   return Vec1<T>(static_cast<T>(rounded));
691 }
692 
693 // Round-to-nearest even.
NearestInt(const Vec1<float> v)694 HWY_API Vec1<int32_t> NearestInt(const Vec1<float> v) {
695   using T = float;
696   using TI = int32_t;
697 
698   const T abs = Abs(v).raw;
699   const bool signbit = std::signbit(v.raw);
700 
701   if (!(abs < MantissaEnd<T>())) {  // Huge or NaN
702     // Check if too large to cast or NaN
703     if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
704       return Vec1<TI>(signbit ? LimitsMin<TI>() : LimitsMax<TI>());
705     }
706     return Vec1<int32_t>(static_cast<TI>(v.raw));
707   }
708   const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
709   const TI rounded = static_cast<TI>(v.raw + bias);
710   if (rounded == 0) return Vec1<int32_t>(0);
711   // Round to even
712   if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
713     return Vec1<TI>(rounded - (signbit ? -1 : 1));
714   }
715   return Vec1<TI>(rounded);
716 }
717 
718 template <typename T>
Trunc(const Vec1<T> v)719 HWY_API Vec1<T> Trunc(const Vec1<T> v) {
720   using TI = MakeSigned<T>;
721   if (!(Abs(v).raw <= MantissaEnd<T>())) {  // Huge or NaN
722     return v;
723   }
724   const TI truncated = static_cast<TI>(v.raw);
725   if (truncated == 0) return CopySignToAbs(Vec1<T>(0), v);
726   return Vec1<T>(static_cast<T>(truncated));
727 }
728 
// Helper for Ceil: rounds `v` toward +infinity by manipulating the bit
// pattern of the floating-point value directly.
// Float is the lane type, Bits an unsigned integer type used to hold its bit
// pattern, kMantissaBits/kExponentBits the widths of the mantissa and
// exponent fields, and V the vector type (deduced from the argument).
template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
          class V>
V Ceiling(const V v) {
  const Bits kExponentMask = (1ull << kExponentBits) - 1;
  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
  const Bits kBias = kExponentMask / 2;

  Float f = v.raw;
  const bool positive = f > Float(0.0);

  // Obtain the bit pattern of the lane (copied from the start of `v`).
  Bits bits;
  CopyBytes<sizeof(Bits)>(&v, &bits);

  // Unbiased exponent.
  const int exponent =
      static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
  // Already an integer.
  if (exponent >= kMantissaBits) return v;
  // exponent < 0 means |v| < 1 => result is 1 for positive inputs, else -0.0.
  // NOTE(review): an input of +0.0 is not `positive`, so it also yields -0.0.
  if (exponent < 0) return positive ? V(1) : V(-0.0);

  // Mask of the mantissa bits that encode the fractional part.
  const Bits mantissa_mask = kMantissaMask >> exponent;
  // Already an integer
  if ((bits & mantissa_mask) == 0) return v;

  // Clear fractional bits and round up: for positive inputs, first add one
  // unit in the last integer place, then drop the fraction.
  if (positive) bits += (kMantissaMask + 1) >> exponent;
  bits &= ~mantissa_mask;

  CopyBytes<sizeof(Bits)>(&bits, &f);
  return V(f);
}
760 
// Helper for the Floor overloads below: rounds `v` toward -infinity by
// manipulating the bit pattern of the floating-point value directly.
// Float is the lane type, Bits an unsigned integer type used to hold its bit
// pattern, kMantissaBits/kExponentBits the widths of the mantissa and
// exponent fields, and V the vector type (deduced from the argument).
template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
          class V>
V Floor(const V v) {
  const Bits kExponentMask = (1ull << kExponentBits) - 1;
  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
  const Bits kBias = kExponentMask / 2;

  Float f = v.raw;
  const bool negative = f < Float(0.0);

  // Obtain the bit pattern of the lane (copied from the start of `v`).
  Bits bits;
  CopyBytes<sizeof(Bits)>(&v, &bits);

  // Unbiased exponent.
  const int exponent =
      static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
  // Already an integer.
  if (exponent >= kMantissaBits) return v;
  // exponent < 0 means |v| < 1 => result is -1 for negative inputs, else 0.
  // NOTE(review): an input of -0.0 is not `negative`, so it yields +0.0.
  if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0));

  // Mask of the mantissa bits that encode the fractional part.
  const Bits mantissa_mask = kMantissaMask >> exponent;
  // Already an integer
  if ((bits & mantissa_mask) == 0) return v;

  // Clear fractional bits and round down: for negative inputs, first add one
  // unit in the last integer place (increasing the magnitude), then drop the
  // fraction.
  if (negative) bits += (kMantissaMask + 1) >> exponent;
  bits &= ~mantissa_mask;

  CopyBytes<sizeof(Bits)>(&bits, &f);
  return V(f);
}
792 
793 // Toward +infinity, aka ceiling
Ceil(const Vec1<float> v)794 HWY_API Vec1<float> Ceil(const Vec1<float> v) {
795   return Ceiling<float, uint32_t, 23, 8>(v);
796 }
Ceil(const Vec1<double> v)797 HWY_API Vec1<double> Ceil(const Vec1<double> v) {
798   return Ceiling<double, uint64_t, 52, 11>(v);
799 }
800 
801 // Toward -infinity, aka floor
Floor(const Vec1<float> v)802 HWY_API Vec1<float> Floor(const Vec1<float> v) {
803   return Floor<float, uint32_t, 23, 8>(v);
804 }
Floor(const Vec1<double> v)805 HWY_API Vec1<double> Floor(const Vec1<double> v) {
806   return Floor<double, uint64_t, 52, 11>(v);
807 }
808 
809 // ================================================== COMPARE
810 
811 template <typename T>
812 HWY_API Mask1<T> operator==(const Vec1<T> a, const Vec1<T> b) {
813   return Mask1<T>::FromBool(a.raw == b.raw);
814 }
815 
816 template <typename T>
817 HWY_API Mask1<T> operator!=(const Vec1<T> a, const Vec1<T> b) {
818   return Mask1<T>::FromBool(a.raw != b.raw);
819 }
820 
821 template <typename T>
TestBit(const Vec1<T> v,const Vec1<T> bit)822 HWY_API Mask1<T> TestBit(const Vec1<T> v, const Vec1<T> bit) {
823   static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
824   return (v & bit) == bit;
825 }
826 
827 template <typename T>
828 HWY_API Mask1<T> operator<(const Vec1<T> a, const Vec1<T> b) {
829   return Mask1<T>::FromBool(a.raw < b.raw);
830 }
831 template <typename T>
832 HWY_API Mask1<T> operator>(const Vec1<T> a, const Vec1<T> b) {
833   return Mask1<T>::FromBool(a.raw > b.raw);
834 }
835 
836 template <typename T>
837 HWY_API Mask1<T> operator<=(const Vec1<T> a, const Vec1<T> b) {
838   return Mask1<T>::FromBool(a.raw <= b.raw);
839 }
840 template <typename T>
841 HWY_API Mask1<T> operator>=(const Vec1<T> a, const Vec1<T> b) {
842   return Mask1<T>::FromBool(a.raw >= b.raw);
843 }
844 
845 // ================================================== MEMORY
846 
847 // ------------------------------ Load
848 
849 template <typename T>
Load(Sisd<T>,const T * HWY_RESTRICT aligned)850 HWY_API Vec1<T> Load(Sisd<T> /* tag */, const T* HWY_RESTRICT aligned) {
851   T t;
852   CopyBytes<sizeof(T)>(aligned, &t);
853   return Vec1<T>(t);
854 }
855 
856 template <typename T>
MaskedLoad(Mask1<T> m,Sisd<T> d,const T * HWY_RESTRICT aligned)857 HWY_API Vec1<T> MaskedLoad(Mask1<T> m, Sisd<T> d,
858                            const T* HWY_RESTRICT aligned) {
859   return IfThenElseZero(m, Load(d, aligned));
860 }
861 
// Unaligned load: with a single lane this simply forwards to Load.
template <typename T>
HWY_API Vec1<T> LoadU(Sisd<T> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}
866 
// In some use cases, "load single lane" is sufficient; otherwise avoid this.
// With only one lane there is nothing to duplicate, so this forwards to Load.
template <typename T>
HWY_API Vec1<T> LoadDup128(Sisd<T> d, const T* HWY_RESTRICT aligned) {
  return Load(d, aligned);
}
872 
873 // ------------------------------ Store
874 
// Writes the single lane of `v` to `aligned` by copying sizeof(T) bytes.
template <typename T>
HWY_API void Store(const Vec1<T> v, Sisd<T> /* tag */,
                   T* HWY_RESTRICT aligned) {
  CopyBytes<sizeof(T)>(&v.raw, aligned);
}
880 
881 template <typename T>
StoreU(const Vec1<T> v,Sisd<T> d,T * HWY_RESTRICT p)882 HWY_API void StoreU(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT p) {
883   return Store(v, d, p);
884 }
885 
886 // ------------------------------ StoreInterleaved3
887 
StoreInterleaved3(const Vec1<uint8_t> v0,const Vec1<uint8_t> v1,const Vec1<uint8_t> v2,Sisd<uint8_t> d,uint8_t * HWY_RESTRICT unaligned)888 HWY_API void StoreInterleaved3(const Vec1<uint8_t> v0, const Vec1<uint8_t> v1,
889                                const Vec1<uint8_t> v2, Sisd<uint8_t> d,
890                                uint8_t* HWY_RESTRICT unaligned) {
891   StoreU(v0, d, unaligned + 0);
892   StoreU(v1, d, unaligned + 1);
893   StoreU(v2, d, unaligned + 2);
894 }
895 
StoreInterleaved4(const Vec1<uint8_t> v0,const Vec1<uint8_t> v1,const Vec1<uint8_t> v2,const Vec1<uint8_t> v3,Sisd<uint8_t> d,uint8_t * HWY_RESTRICT unaligned)896 HWY_API void StoreInterleaved4(const Vec1<uint8_t> v0, const Vec1<uint8_t> v1,
897                                const Vec1<uint8_t> v2, const Vec1<uint8_t> v3,
898                                Sisd<uint8_t> d,
899                                uint8_t* HWY_RESTRICT unaligned) {
900   StoreU(v0, d, unaligned + 0);
901   StoreU(v1, d, unaligned + 1);
902   StoreU(v2, d, unaligned + 2);
903   StoreU(v3, d, unaligned + 3);
904 }
905 
906 // ------------------------------ Stream
907 
908 template <typename T>
Stream(const Vec1<T> v,Sisd<T> d,T * HWY_RESTRICT aligned)909 HWY_API void Stream(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT aligned) {
910   return Store(v, d, aligned);
911 }
912 
913 // ------------------------------ Scatter
914 
915 template <typename T, typename Offset>
ScatterOffset(Vec1<T> v,Sisd<T> d,T * base,const Vec1<Offset> offset)916 HWY_API void ScatterOffset(Vec1<T> v, Sisd<T> d, T* base,
917                            const Vec1<Offset> offset) {
918   static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
919   uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw;
920   return Store(v, d, reinterpret_cast<T*>(base8));
921 }
922 
923 template <typename T, typename Index>
ScatterIndex(Vec1<T> v,Sisd<T> d,T * HWY_RESTRICT base,const Vec1<Index> index)924 HWY_API void ScatterIndex(Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT base,
925                           const Vec1<Index> index) {
926   static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
927   return Store(v, d, base + index.raw);
928 }
929 
930 // ------------------------------ Gather
931 
932 template <typename T, typename Offset>
GatherOffset(Sisd<T> d,const T * base,const Vec1<Offset> offset)933 HWY_API Vec1<T> GatherOffset(Sisd<T> d, const T* base,
934                              const Vec1<Offset> offset) {
935   static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
936   const uintptr_t addr = reinterpret_cast<uintptr_t>(base) + offset.raw;
937   return Load(d, reinterpret_cast<const T*>(addr));
938 }
939 
940 template <typename T, typename Index>
GatherIndex(Sisd<T> d,const T * HWY_RESTRICT base,const Vec1<Index> index)941 HWY_API Vec1<T> GatherIndex(Sisd<T> d, const T* HWY_RESTRICT base,
942                             const Vec1<Index> index) {
943   static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
944   return Load(d, base + index.raw);
945 }
946 
947 // ================================================== CONVERT
948 
949 // ConvertTo and DemoteTo with floating-point input and integer output truncate
950 // (rounding toward zero).
951 
952 template <typename FromT, typename ToT>
PromoteTo(Sisd<ToT>,Vec1<FromT> from)953 HWY_API Vec1<ToT> PromoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
954   static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting");
955   // For bits Y > X, floatX->floatY and intX->intY are always representable.
956   return Vec1<ToT>(static_cast<ToT>(from.raw));
957 }
958 
959 // MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(FromT) is here,
960 // so we overload for FromT=double and ToT={float,int32_t}.
DemoteTo(Sisd<float>,Vec1<double> from)961 HWY_API Vec1<float> DemoteTo(Sisd<float> /* tag */, Vec1<double> from) {
962   // Prevent ubsan errors when converting float to narrower integer/float
963   if (std::isinf(from.raw) ||
964       std::fabs(from.raw) > static_cast<double>(HighestValue<float>())) {
965     return Vec1<float>(std::signbit(from.raw) ? LowestValue<float>()
966                                               : HighestValue<float>());
967   }
968   return Vec1<float>(static_cast<float>(from.raw));
969 }
DemoteTo(Sisd<int32_t>,Vec1<double> from)970 HWY_API Vec1<int32_t> DemoteTo(Sisd<int32_t> /* tag */, Vec1<double> from) {
971   // Prevent ubsan errors when converting int32_t to narrower integer/int32_t
972   if (std::isinf(from.raw) ||
973       std::fabs(from.raw) > static_cast<double>(HighestValue<int32_t>())) {
974     return Vec1<int32_t>(std::signbit(from.raw) ? LowestValue<int32_t>()
975                                                 : HighestValue<int32_t>());
976   }
977   return Vec1<int32_t>(static_cast<int32_t>(from.raw));
978 }
979 
980 template <typename FromT, typename ToT>
DemoteTo(Sisd<ToT>,Vec1<FromT> from)981 HWY_API Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
982   static_assert(!IsFloat<FromT>(), "FromT=double are handled above");
983   static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");
984 
985   // Int to int: choose closest value in ToT to `from` (avoids UB)
986   from.raw = HWY_MIN(HWY_MAX(LimitsMin<ToT>(), from.raw), LimitsMax<ToT>());
987   return Vec1<ToT>(static_cast<ToT>(from.raw));
988 }
989 
// float16 -> float: decodes the 16-bit pattern (1 sign, 5 exponent and
// 10 mantissa bits) and re-encodes it as a 32-bit float.
// NOTE(review): only biased_exp == 0 is special-cased; an exponent field of 31
// (Inf/NaN in IEEE binary16) takes the normalized path - confirm this is
// intended.
HWY_API Vec1<float> PromoteTo(Sisd<float> /* tag */, const Vec1<float16_t> v) {
  // Obtain the raw 16 bits, either by byte copy or from the emulated struct.
#if HWY_NATIVE_FLOAT16
  uint16_t bits16;
  CopyBytes<2>(&v.raw, &bits16);
#else
  const uint16_t bits16 = v.raw.bits;
#endif
  const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
  const uint32_t mantissa = bits16 & 0x3FF;

  // Subnormal or zero
  if (biased_exp == 0) {
    // Value is mantissa / 1024 * 2^-14, with no implicit leading one.
    const float subnormal =
        (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
    return Vec1<float>(sign ? -subnormal : subnormal);
  }

  // Normalized: convert the representation directly (faster than ldexp/tables).
  // Re-bias the exponent (15 -> 127) and left-align the 10-bit mantissa in the
  // 23-bit field.
  const uint32_t biased_exp32 = biased_exp + (127 - 15);
  const uint32_t mantissa32 = mantissa << (23 - 10);
  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
  float out;
  CopyBytes<4>(&bits32, &out);
  return Vec1<float>(out);
}
1016 
// bfloat16 -> float: converts the lane with F32FromBF16 and wraps it via Set.
HWY_API Vec1<float> PromoteTo(Sisd<float> d, const Vec1<bfloat16_t> v) {
  return Set(d, F32FromBF16(v.raw));
}
1020 
// float -> float16: re-encodes the 32-bit pattern as 1 sign, 5 exponent and
// 10 mantissa bits. Mantissa bits that do not fit are dropped (right shift),
// the unbiased exponent is capped at 15, and inputs with exponent below -24
// become zero.
HWY_API Vec1<float16_t> DemoteTo(Sisd<float16_t> /* tag */,
                                 const Vec1<float> v) {
  uint32_t bits32;
  CopyBytes<4>(&v.raw, &bits32);
  const uint32_t sign = bits32 >> 31;
  const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
  const uint32_t mantissa32 = bits32 & 0x7FFFFF;

  // Unbiased exponent, clamped from above to the largest value used here.
  const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);

  // Tiny or zero => zero.
  // (All 16 bits are cleared, so the sign is not carried over on this path.)
  Vec1<float16_t> out;
  if (exp < -24) {
#if HWY_NATIVE_FLOAT16
    const uint16_t zero = 0;
    CopyBytes<2>(&zero, &out.raw);
#else
    out.raw.bits = 0;
#endif
    return out;
  }

  uint32_t biased_exp16, mantissa16;

  // exp = [-24, -15] => subnormal
  if (exp < -14) {
    biased_exp16 = 0;
    const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
    HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
    // Make the implicit leading one explicit at bit (10 - sub_exp) and shift
    // the float mantissa right by the additional sub_exp positions.
    mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
                                       (mantissa32 >> (13 + sub_exp)));
  } else {
    // exp = [-14, 15]
    biased_exp16 = static_cast<uint32_t>(exp + 15);
    HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
    // Keep the top 10 of the 23 mantissa bits.
    mantissa16 = mantissa32 >> 13;
  }

  HWY_DASSERT(mantissa16 < 1024);
  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
  HWY_DASSERT(bits16 < 0x10000);
#if HWY_NATIVE_FLOAT16
  const uint16_t narrowed = static_cast<uint16_t>(bits16);  // big-endian safe
  CopyBytes<2>(&narrowed, &out.raw);
#else
  out.raw.bits = static_cast<uint16_t>(bits16);
#endif
  return out;
}
1070 
// float -> bfloat16: converts the lane with BF16FromF32 and wraps it via Set.
HWY_API Vec1<bfloat16_t> DemoteTo(Sisd<bfloat16_t> d, const Vec1<float> v) {
  return Set(d, BF16FromF32(v.raw));
}
1074 
1075 template <typename FromT, typename ToT, HWY_IF_FLOAT(FromT)>
ConvertTo(Sisd<ToT>,Vec1<FromT> from)1076 HWY_API Vec1<ToT> ConvertTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
1077   static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
1078   // float## -> int##: return closest representable value. We cannot exactly
1079   // represent LimitsMax<ToT> in FromT, so use double.
1080   const double f = static_cast<double>(from.raw);
1081   if (std::isinf(from.raw) ||
1082       std::fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
1083     return Vec1<ToT>(std::signbit(from.raw) ? LimitsMin<ToT>()
1084                                             : LimitsMax<ToT>());
1085   }
1086   return Vec1<ToT>(static_cast<ToT>(from.raw));
1087 }
1088 
1089 template <typename FromT, typename ToT, HWY_IF_NOT_FLOAT(FromT)>
ConvertTo(Sisd<ToT>,Vec1<FromT> from)1090 HWY_API Vec1<ToT> ConvertTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
1091   static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
1092   // int## -> float##: no check needed
1093   return Vec1<ToT>(static_cast<ToT>(from.raw));
1094 }
1095 
// uint32_t -> uint8_t, implemented via DemoteTo with a uint8_t tag.
HWY_API Vec1<uint8_t> U8FromU32(const Vec1<uint32_t> v) {
  return DemoteTo(Sisd<uint8_t>(), v);
}
1099 
1100 // ================================================== COMBINE
1101 // UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported.
1102 
// Returns the input unchanged; the result type is again a single-lane vector.
template <typename T>
HWY_API Vec1<T> LowerHalf(Vec1<T> v) {
  return v;
}
1107 
// Tag-taking overload of LowerHalf; the tag is unused and `v` is returned
// unchanged.
template <typename T>
HWY_API Vec1<T> LowerHalf(Sisd<T> /* tag */, Vec1<T> v) {
  return v;
}
1112 
1113 // ================================================== SWIZZLE
1114 
// Returns the scalar value held in the single lane.
template <typename T>
HWY_API T GetLane(const Vec1<T> v) {
  return v.raw;
}
1119 
// Returns `v` unchanged: the only lane is lane 0, which is even.
template <typename T>
HWY_API Vec1<T> DupEven(Vec1<T> v) {
  return v;
}
1124 // DupOdd is unsupported.
1125 
// Returns `even`; the `odd` argument is ignored because the only lane is
// lane 0.
template <typename T>
HWY_API Vec1<T> OddEven(Vec1<T> /* odd */, Vec1<T> even) {
  return even;
}
1130 
// Returns `even`; the `odd` argument is ignored.
template <typename T>
HWY_API Vec1<T> OddEvenBlocks(Vec1<T> /* odd */, Vec1<T> even) {
  return even;
}
1135 
1136 // ------------------------------ SwapAdjacentBlocks
1137 
// Returns `v` unchanged: a single-lane vector has no adjacent block to swap
// with.
template <typename T>
HWY_API Vec1<T> SwapAdjacentBlocks(Vec1<T> v) {
  return v;
}
1142 
1143 // ------------------------------ TableLookupLanes
1144 
// Returned by SetTableIndices for use by TableLookupLanes.
// Holds one index, stored as the signed integer type of the same size as T.
template <typename T>
struct Indices1 {
  MakeSigned<T> raw;
};
1150 
1151 template <typename T, typename TI>
IndicesFromVec(Sisd<T>,Vec1<TI> vec)1152 HWY_API Indices1<T> IndicesFromVec(Sisd<T>, Vec1<TI> vec) {
1153   static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size");
1154   HWY_DASSERT(vec.raw == 0);
1155   return Indices1<T>{vec.raw};
1156 }
1157 
1158 template <typename T, typename TI>
SetTableIndices(Sisd<T> d,const TI * idx)1159 HWY_API Indices1<T> SetTableIndices(Sisd<T> d, const TI* idx) {
1160   return IndicesFromVec(d, LoadU(idx));
1161 }
1162 
// Returns `v` unchanged; the index argument is ignored (IndicesFromVec
// asserts that the only index is 0).
template <typename T>
HWY_API Vec1<T> TableLookupLanes(const Vec1<T> v, const Indices1<T> /* idx */) {
  return v;
}
1167 
1168 // ------------------------------ ReverseBlocks
1169 
// Single block: no change
template <typename T>
HWY_API Vec1<T> ReverseBlocks(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}
1175 
1176 // ------------------------------ Reverse
1177 
// Single lane: reversing the lane order leaves `v` unchanged.
template <typename T>
HWY_API Vec1<T> Reverse(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}
1182 
// Returns `v` unchanged; the tag is unused.
template <typename T>
HWY_API Vec1<T> Reverse2(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}
1187 
// Returns `v` unchanged; the tag is unused.
template <typename T>
HWY_API Vec1<T> Reverse4(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}
1192 
// Returns `v` unchanged; the tag is unused.
template <typename T>
HWY_API Vec1<T> Reverse8(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}
1197 
1198 // ================================================== BLOCKWISE
1199 // Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported.
1200 
1201 // ------------------------------ Broadcast/splat any lane
1202 
// Broadcasts lane kLane; only kLane == 0 is accepted (compile-time check), in
// which case the result is `v` itself.
template <int kLane, typename T>
HWY_API Vec1<T> Broadcast(const Vec1<T> v) {
  static_assert(kLane == 0, "Scalar only has one lane");
  return v;
}
1208 
1209 // ------------------------------ TableLookupBytes, TableLookupBytesOr0
1210 
1211 template <typename T, typename TI>
TableLookupBytes(const Vec1<T> in,const Vec1<TI> indices)1212 HWY_API Vec1<TI> TableLookupBytes(const Vec1<T> in, const Vec1<TI> indices) {
1213   uint8_t in_bytes[sizeof(T)];
1214   uint8_t idx_bytes[sizeof(T)];
1215   uint8_t out_bytes[sizeof(T)];
1216   CopyBytes<sizeof(T)>(&in, &in_bytes);
1217   CopyBytes<sizeof(T)>(&indices, &idx_bytes);
1218   for (size_t i = 0; i < sizeof(T); ++i) {
1219     out_bytes[i] = in_bytes[idx_bytes[i]];
1220   }
1221   TI out;
1222   CopyBytes<sizeof(TI)>(&out_bytes, &out);
1223   return Vec1<TI>{out};
1224 }
1225 
1226 template <typename T, typename TI>
TableLookupBytesOr0(const Vec1<T> in,const Vec1<TI> indices)1227 HWY_API Vec1<TI> TableLookupBytesOr0(const Vec1<T> in, const Vec1<TI> indices) {
1228   uint8_t in_bytes[sizeof(T)];
1229   uint8_t idx_bytes[sizeof(T)];
1230   uint8_t out_bytes[sizeof(T)];
1231   CopyBytes<sizeof(T)>(&in, &in_bytes);
1232   CopyBytes<sizeof(T)>(&indices, &idx_bytes);
1233   for (size_t i = 0; i < sizeof(T); ++i) {
1234     out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]];
1235   }
1236   TI out;
1237   CopyBytes<sizeof(TI)>(&out_bytes, &out);
1238   return Vec1<TI>{out};
1239 }
1240 
1241 // ------------------------------ ZipLower
1242 
ZipLower(const Vec1<uint8_t> a,const Vec1<uint8_t> b)1243 HWY_API Vec1<uint16_t> ZipLower(const Vec1<uint8_t> a, const Vec1<uint8_t> b) {
1244   return Vec1<uint16_t>(static_cast<uint16_t>((uint32_t(b.raw) << 8) + a.raw));
1245 }
ZipLower(const Vec1<uint16_t> a,const Vec1<uint16_t> b)1246 HWY_API Vec1<uint32_t> ZipLower(const Vec1<uint16_t> a,
1247                                 const Vec1<uint16_t> b) {
1248   return Vec1<uint32_t>((uint32_t(b.raw) << 16) + a.raw);
1249 }
ZipLower(const Vec1<uint32_t> a,const Vec1<uint32_t> b)1250 HWY_API Vec1<uint64_t> ZipLower(const Vec1<uint32_t> a,
1251                                 const Vec1<uint32_t> b) {
1252   return Vec1<uint64_t>((uint64_t(b.raw) << 32) + a.raw);
1253 }
ZipLower(const Vec1<int8_t> a,const Vec1<int8_t> b)1254 HWY_API Vec1<int16_t> ZipLower(const Vec1<int8_t> a, const Vec1<int8_t> b) {
1255   return Vec1<int16_t>(static_cast<int16_t>((int32_t(b.raw) << 8) + a.raw));
1256 }
ZipLower(const Vec1<int16_t> a,const Vec1<int16_t> b)1257 HWY_API Vec1<int32_t> ZipLower(const Vec1<int16_t> a, const Vec1<int16_t> b) {
1258   return Vec1<int32_t>((int32_t(b.raw) << 16) + a.raw);
1259 }
ZipLower(const Vec1<int32_t> a,const Vec1<int32_t> b)1260 HWY_API Vec1<int64_t> ZipLower(const Vec1<int32_t> a, const Vec1<int32_t> b) {
1261   return Vec1<int64_t>((int64_t(b.raw) << 32) + a.raw);
1262 }
1263 
1264 template <typename T, typename TW = MakeWide<T>, class VW = Vec1<TW>>
ZipLower(Sisd<TW>,Vec1<T> a,Vec1<T> b)1265 HWY_API VW ZipLower(Sisd<TW> /* tag */, Vec1<T> a, Vec1<T> b) {
1266   return VW(static_cast<TW>((TW{b.raw} << (sizeof(T) * 8)) + a.raw));
1267 }
1268 
1269 // ================================================== MASK
1270 
1271 template <typename T>
AllFalse(Sisd<T>,const Mask1<T> mask)1272 HWY_API bool AllFalse(Sisd<T> /* tag */, const Mask1<T> mask) {
1273   return mask.bits == 0;
1274 }
1275 
1276 template <typename T>
AllTrue(Sisd<T>,const Mask1<T> mask)1277 HWY_API bool AllTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
1278   return mask.bits != 0;
1279 }
1280 
1281 // `p` points to at least 8 readable bytes, not all of which need be valid.
1282 template <typename T>
LoadMaskBits(Sisd<T>,const uint8_t * HWY_RESTRICT bits)1283 HWY_API Mask1<T> LoadMaskBits(Sisd<T> /* tag */,
1284                               const uint8_t* HWY_RESTRICT bits) {
1285   return Mask1<T>::FromBool((bits[0] & 1) != 0);
1286 }
1287 
1288 // `p` points to at least 8 writable bytes.
1289 template <typename T>
StoreMaskBits(Sisd<T> d,const Mask1<T> mask,uint8_t * bits)1290 HWY_API size_t StoreMaskBits(Sisd<T> d, const Mask1<T> mask, uint8_t* bits) {
1291   *bits = AllTrue(d, mask);
1292   return 1;
1293 }
1294 
1295 template <typename T>
CountTrue(Sisd<T>,const Mask1<T> mask)1296 HWY_API size_t CountTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
1297   return mask.bits == 0 ? 0 : 1;
1298 }
1299 
1300 template <typename T>
FindFirstTrue(Sisd<T>,const Mask1<T> mask)1301 HWY_API intptr_t FindFirstTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
1302   return mask.bits == 0 ? -1 : 0;
1303 }
1304 
1305 // ------------------------------ Compress, CompressBits
1306 
// Returns `v` regardless of the mask.
template <typename T>
HWY_API Vec1<T> Compress(Vec1<T> v, const Mask1<T> /* mask */) {
  // Upper lanes are undefined, so result is the same independent of mask.
  return v;
}
1312 
// Overload taking packed mask bits instead of a Mask1; the bits are ignored
// and `v` is returned unchanged.
// NOTE(review): the section header mentions CompressBits, but this overload is
// named Compress - confirm the intended name.
template <typename T>
HWY_API Vec1<T> Compress(Vec1<T> v, const uint8_t* HWY_RESTRICT /* bits */) {
  return v;
}
1317 
1318 // ------------------------------ CompressStore
1319 
1320 template <typename T>
CompressStore(Vec1<T> v,const Mask1<T> mask,Sisd<T> d,T * HWY_RESTRICT unaligned)1321 HWY_API size_t CompressStore(Vec1<T> v, const Mask1<T> mask, Sisd<T> d,
1322                              T* HWY_RESTRICT unaligned) {
1323   StoreU(Compress(v, mask), d, unaligned);
1324   return CountTrue(d, mask);
1325 }
1326 
1327 // ------------------------------ CompressBlendedStore
1328 
1329 template <typename T>
CompressBlendedStore(Vec1<T> v,const Mask1<T> mask,Sisd<T> d,T * HWY_RESTRICT unaligned)1330 HWY_API size_t CompressBlendedStore(Vec1<T> v, const Mask1<T> mask, Sisd<T> d,
1331                                     T* HWY_RESTRICT unaligned) {
1332   if (!mask.bits) return 0;
1333   StoreU(v, d, unaligned);
1334   return 1;
1335 }
1336 
1337 // ------------------------------ CompressBitsStore
1338 
1339 template <typename T>
CompressBitsStore(Vec1<T> v,const uint8_t * HWY_RESTRICT bits,Sisd<T> d,T * HWY_RESTRICT unaligned)1340 HWY_API size_t CompressBitsStore(Vec1<T> v, const uint8_t* HWY_RESTRICT bits,
1341                                  Sisd<T> d, T* HWY_RESTRICT unaligned) {
1342   const Mask1<T> mask = LoadMaskBits(d, bits);
1343   StoreU(Compress(v, mask), d, unaligned);
1344   return CountTrue(d, mask);
1345 }
1346 
1347 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
1348 
ReorderWidenMulAccumulate(Sisd<float>,Vec1<bfloat16_t> a,Vec1<bfloat16_t> b,const Vec1<float> sum0,Vec1<float> &)1349 HWY_API Vec1<float> ReorderWidenMulAccumulate(Sisd<float> /* tag */,
1350                                               Vec1<bfloat16_t> a,
1351                                               Vec1<bfloat16_t> b,
1352                                               const Vec1<float> sum0,
1353                                               Vec1<float>& /* sum1 */) {
1354   return MulAdd(Vec1<float>(F32FromBF16(a.raw)),
1355                 Vec1<float>(F32FromBF16(b.raw)), sum0);
1356 }
1357 
1358 // ================================================== REDUCTIONS
1359 
// Sum of all lanes, i.e. the only one.
template <typename T>
HWY_API Vec1<T> SumOfLanes(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}
// Minimum of all lanes, i.e. the only one.
template <typename T>
HWY_API Vec1<T> MinOfLanes(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}
// Maximum of all lanes, i.e. the only one.
template <typename T>
HWY_API Vec1<T> MaxOfLanes(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}
1373 
1374 // ================================================== Operator wrapper
1375 
// Named wrapper for operator+.
template <class V>
HWY_API V Add(V a, V b) {
  return a + b;
}
// Named wrapper for operator-.
template <class V>
HWY_API V Sub(V a, V b) {
  return a - b;
}
1384 
// Named wrapper for operator*.
template <class V>
HWY_API V Mul(V a, V b) {
  return a * b;
}
// Named wrapper for operator/.
template <class V>
HWY_API V Div(V a, V b) {
  return a / b;
}
1393 
1394 template <class V>
Shl(V a,V b)1395 V Shl(V a, V b) {
1396   return a << b;
1397 }
1398 template <class V>
Shr(V a,V b)1399 V Shr(V a, V b) {
1400   return a >> b;
1401 }
1402 
// Named wrapper for operator==; the return type is whatever `a == b` yields.
template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
  return a == b;
}
// Named wrapper for operator!=. The return type is spelled via `a == b`.
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
  return a != b;
}
// Named wrapper for operator<. The return type is spelled via `a == b`.
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
  return a < b;
}
1415 
// Named wrapper for operator>. The return type is spelled via `a == b`.
template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
  return a > b;
}
// Named wrapper for operator>=. The return type is spelled via `a == b`.
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
  return a >= b;
}
1424 
// Named wrapper for operator<=. The return type is spelled via `a == b`.
template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
  return a <= b;
}
1429 
1430 // NOLINTNEXTLINE(google-readability-namespace-comments)
1431 }  // namespace HWY_NAMESPACE
1432 }  // namespace hwy
1433 HWY_AFTER_NAMESPACE();
1434