1 // Copyright 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // Single-element vectors and operations.
16 // External include guard in highway.h - see comment there.
17 
#include <stddef.h>
#include <stdint.h>

#include <algorithm>  // std::min
#include <cmath>      // std::abs, std::isnan, std::signbit, std::sqrt

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"
25 
26 HWY_BEFORE_NAMESPACE();
27 namespace hwy {
28 namespace HWY_NAMESPACE {
29 
// Single instruction, single data: descriptor tag for vectors of exactly one
// lane of type T.
template <typename T>
using Sisd = Simd<T, 1>;
33 
// (Wrapper class required for overloading comparison operators.)
template <typename T>
struct Vec1 {
  HWY_INLINE Vec1() = default;
  Vec1(const Vec1&) = default;
  Vec1& operator=(const Vec1&) = default;
  HWY_INLINE explicit Vec1(const T t) : raw(t) {}

  // Compound assignment: each delegates to the corresponding free binary
  // operator defined later in this file.
  HWY_INLINE Vec1& operator*=(const Vec1 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec1& operator/=(const Vec1 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec1& operator+=(const Vec1 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec1& operator-=(const Vec1 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec1& operator&=(const Vec1 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec1& operator|=(const Vec1 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec1& operator^=(const Vec1 other) {
    return *this = (*this ^ other);
  }

  // The single lane's value.
  T raw;
};
66 
// 0 or FF..FF, same size as Vec1.
template <typename T>
class Mask1 {
  using Raw = hwy::MakeUnsigned<T>;

 public:
  // Returns an all-ones mask for b == true, else an all-zeros mask.
  static HWY_INLINE Mask1<T> FromBool(bool b) {
    Mask1<T> mask;
    mask.bits = b ? ~Raw(0) : 0;
    return mask;
  }

  // 0 or all-ones; unsigned so bitwise ops are well-defined.
  Raw bits;
};
81 
82 // ------------------------------ BitCast
83 
84 template <typename T, typename FromT>
BitCast(Sisd<T>,Vec1<FromT> v)85 HWY_INLINE Vec1<T> BitCast(Sisd<T> /* tag */, Vec1<FromT> v) {
86   static_assert(sizeof(T) <= sizeof(FromT), "Promoting is undefined");
87   T to;
88   CopyBytes<sizeof(FromT)>(&v.raw, &to);
89   return Vec1<T>(to);
90 }
91 
92 // ------------------------------ Set
93 
94 template <typename T>
Zero(Sisd<T>)95 HWY_INLINE Vec1<T> Zero(Sisd<T> /* tag */) {
96   return Vec1<T>(T(0));
97 }
98 
99 template <typename T, typename T2>
Set(Sisd<T>,const T2 t)100 HWY_INLINE Vec1<T> Set(Sisd<T> /* tag */, const T2 t) {
101   return Vec1<T>(static_cast<T>(t));
102 }
103 
104 template <typename T>
Undefined(Sisd<T> d)105 HWY_INLINE Vec1<T> Undefined(Sisd<T> d) {
106   return Zero(d);
107 }
108 
109 template <typename T, typename T2>
Iota(const Sisd<T>,const T2 first)110 Vec1<T> Iota(const Sisd<T> /* tag */, const T2 first) {
111   return Vec1<T>(static_cast<T>(first));
112 }
113 
114 // ================================================== LOGICAL
115 
116 // ------------------------------ Not
117 
118 template <typename T>
Not(const Vec1<T> v)119 HWY_INLINE Vec1<T> Not(const Vec1<T> v) {
120   using TU = MakeUnsigned<T>;
121   const Sisd<TU> du;
122   return BitCast(Sisd<T>(), Vec1<TU>(~BitCast(du, v).raw));
123 }
124 
125 // ------------------------------ And
126 
127 template <typename T>
And(const Vec1<T> a,const Vec1<T> b)128 HWY_INLINE Vec1<T> And(const Vec1<T> a, const Vec1<T> b) {
129   using TU = MakeUnsigned<T>;
130   const Sisd<TU> du;
131   return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw & BitCast(du, b).raw));
132 }
133 template <typename T>
134 HWY_INLINE Vec1<T> operator&(const Vec1<T> a, const Vec1<T> b) {
135   return And(a, b);
136 }
137 
138 // ------------------------------ AndNot
139 
140 template <typename T>
AndNot(const Vec1<T> a,const Vec1<T> b)141 HWY_INLINE Vec1<T> AndNot(const Vec1<T> a, const Vec1<T> b) {
142   using TU = MakeUnsigned<T>;
143   const Sisd<TU> du;
144   return BitCast(Sisd<T>(), Vec1<TU>(~BitCast(du, a).raw & BitCast(du, b).raw));
145 }
146 
147 // ------------------------------ Or
148 
149 template <typename T>
Or(const Vec1<T> a,const Vec1<T> b)150 HWY_INLINE Vec1<T> Or(const Vec1<T> a, const Vec1<T> b) {
151   using TU = MakeUnsigned<T>;
152   const Sisd<TU> du;
153   return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw | BitCast(du, b).raw));
154 }
155 template <typename T>
156 HWY_INLINE Vec1<T> operator|(const Vec1<T> a, const Vec1<T> b) {
157   return Or(a, b);
158 }
159 
160 // ------------------------------ Xor
161 
162 template <typename T>
Xor(const Vec1<T> a,const Vec1<T> b)163 HWY_INLINE Vec1<T> Xor(const Vec1<T> a, const Vec1<T> b) {
164   using TU = MakeUnsigned<T>;
165   const Sisd<TU> du;
166   return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw ^ BitCast(du, b).raw));
167 }
168 template <typename T>
169 HWY_INLINE Vec1<T> operator^(const Vec1<T> a, const Vec1<T> b) {
170   return Xor(a, b);
171 }
172 
// ------------------------------ CopySign

// Returns a value with the magnitude of `magn` and the sign of `sign`.
template <typename T>
HWY_API Vec1<T> CopySign(const Vec1<T> magn, const Vec1<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const auto msb = SignBit(Sisd<T>());
  // Clear the sign bit of magn, then merge in the sign bit of sign.
  return Or(AndNot(msb, magn), And(msb, sign));
}

// As CopySign, but assumes the sign bit of `abs` is already clear.
template <typename T>
HWY_API Vec1<T> CopySignToAbs(const Vec1<T> abs, const Vec1<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  return Or(abs, And(SignBit(Sisd<T>()), sign));
}

// ------------------------------ BroadcastSignBit

// Returns all-ones (T(-1)) if v is negative, else zero. Intended for signed T;
// for unsigned T the comparison is always false and the result is zero.
template <typename T>
HWY_API Vec1<T> BroadcastSignBit(const Vec1<T> v) {
  // This is used inside ShiftRight, so we cannot implement in terms of it.
  return v.raw < 0 ? Vec1<T>(T(-1)) : Vec1<T>(0);
}
195 
// ------------------------------ Mask

// Reinterprets a mask as one for a same-sized lane type; bits are unchanged.
template <typename TFrom, typename TTo>
HWY_API Mask1<TTo> RebindMask(Sisd<TTo> /*tag*/, Mask1<TFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  return Mask1<TTo>{m.bits};
}

// v must be 0 or FF..FF.
template <typename T>
HWY_INLINE Mask1<T> MaskFromVec(const Vec1<T> v) {
  Mask1<T> mask;
  CopyBytes<sizeof(mask.bits)>(&v.raw, &mask.bits);
  return mask;
}

// Returns a vector whose bit pattern equals the mask (0 or all-ones).
template <typename T>
Vec1<T> VecFromMask(const Mask1<T> mask) {
  Vec1<T> v;
  CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
  return v;
}

// Same as above, with the usual descriptor-tag parameter.
template <typename T>
Vec1<T> VecFromMask(Sisd<T> /* tag */, const Mask1<T> mask) {
  Vec1<T> v;
  CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
  return v;
}

// Mask with the (only) lane enabled iff n != 0.
template <typename T>
HWY_INLINE Mask1<T> FirstN(Sisd<T> /*tag*/, size_t n) {
  return Mask1<T>::FromBool(n != 0);
}
230 
231 // Returns mask ? yes : no.
232 template <typename T>
IfThenElse(const Mask1<T> mask,const Vec1<T> yes,const Vec1<T> no)233 HWY_INLINE Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes,
234                               const Vec1<T> no) {
235   return mask.bits ? yes : no;
236 }
237 
238 template <typename T>
IfThenElseZero(const Mask1<T> mask,const Vec1<T> yes)239 HWY_INLINE Vec1<T> IfThenElseZero(const Mask1<T> mask, const Vec1<T> yes) {
240   return mask.bits ? yes : Vec1<T>(0);
241 }
242 
243 template <typename T>
IfThenZeroElse(const Mask1<T> mask,const Vec1<T> no)244 HWY_INLINE Vec1<T> IfThenZeroElse(const Mask1<T> mask, const Vec1<T> no) {
245   return mask.bits ? Vec1<T>(0) : no;
246 }
247 
248 template <typename T>
ZeroIfNegative(const Vec1<T> v)249 HWY_INLINE Vec1<T> ZeroIfNegative(const Vec1<T> v) {
250   return v.raw < 0 ? Vec1<T>(0) : v;
251 }
252 
253 // ------------------------------ Mask logical
254 
255 template <typename T>
Not(const Mask1<T> m)256 HWY_API Mask1<T> Not(const Mask1<T> m) {
257   const Sisd<T> d;
258   return MaskFromVec(Not(VecFromMask(d, m)));
259 }
260 
261 template <typename T>
And(const Mask1<T> a,Mask1<T> b)262 HWY_API Mask1<T> And(const Mask1<T> a, Mask1<T> b) {
263   const Sisd<T> d;
264   return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
265 }
266 
267 template <typename T>
AndNot(const Mask1<T> a,Mask1<T> b)268 HWY_API Mask1<T> AndNot(const Mask1<T> a, Mask1<T> b) {
269   const Sisd<T> d;
270   return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
271 }
272 
273 template <typename T>
Or(const Mask1<T> a,Mask1<T> b)274 HWY_API Mask1<T> Or(const Mask1<T> a, Mask1<T> b) {
275   const Sisd<T> d;
276   return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
277 }
278 
279 template <typename T>
Xor(const Mask1<T> a,Mask1<T> b)280 HWY_API Mask1<T> Xor(const Mask1<T> a, Mask1<T> b) {
281   const Sisd<T> d;
282   return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
283 }
284 
285 // ================================================== SHIFTS
286 
287 // ------------------------------ ShiftLeft (BroadcastSignBit)
288 
289 template <int kBits, typename T>
ShiftLeft(const Vec1<T> v)290 HWY_INLINE Vec1<T> ShiftLeft(const Vec1<T> v) {
291   static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
292   return Vec1<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits);
293 }
294 
// Shifts right by a compile-time amount; arithmetic for signed T (replicating
// the sign bit), logical for unsigned T.
template <int kBits, typename T>
HWY_INLINE Vec1<T> ShiftRight(const Vec1<T> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  return Vec1<T>(v.raw >> kBits);
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because
    // signed shifts are still implementation-defined.
    using TU = hwy::MakeUnsigned<T>;
    const Sisd<TU> du;
    const TU shifted = BitCast(du, v).raw >> kBits;
    // All-ones if negative, else zero; shifting it left leaves exactly the
    // top kBits+1 bits set, which ORs the sign into the vacated positions.
    const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
    const TU upper = sign << (sizeof(TU) * 8 - 1 - kBits);
    return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
  } else {
    return Vec1<T>(v.raw >> kBits);  // unsigned, logical shift
  }
#endif
}
317 
318 // ------------------------------ ShiftLeftSame (BroadcastSignBit)
319 
320 template <typename T>
ShiftLeftSame(const Vec1<T> v,int bits)321 HWY_INLINE Vec1<T> ShiftLeftSame(const Vec1<T> v, int bits) {
322   return Vec1<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits);
323 }
324 
// Shifts right by a runtime amount; arithmetic for signed T, logical for
// unsigned T. Caller must ensure 0 <= bits < bit width.
template <typename T>
HWY_INLINE Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  return Vec1<T>(v.raw >> bits);
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because
    // signed shifts are still implementation-defined.
    using TU = hwy::MakeUnsigned<T>;
    const Sisd<TU> du;
    const TU shifted = BitCast(du, v).raw >> bits;
    // All-ones if negative, else zero; shifted left so its set bits cover the
    // positions vacated by the logical shift (plus the original sign bit).
    const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
    const TU upper = sign << (sizeof(TU) * 8 - 1 - bits);
    return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
  } else {
    return Vec1<T>(v.raw >> bits);  // unsigned, logical shift
  }
#endif
}
346 
347 // ------------------------------ Shl
348 
349 // Single-lane => same as ShiftLeftSame except for the argument type.
350 template <typename T>
351 HWY_INLINE Vec1<T> operator<<(const Vec1<T> v, const Vec1<T> bits) {
352   return ShiftLeftSame(v, static_cast<int>(bits.raw));
353 }
354 
355 template <typename T>
356 HWY_INLINE Vec1<T> operator>>(const Vec1<T> v, const Vec1<T> bits) {
357   return ShiftRightSame(v, static_cast<int>(bits.raw));
358 }
359 
// ================================================== ARITHMETIC

// Integer addition: performed in uint64_t so overflow wraps (well-defined),
// then truncated back to T; the mask is all-ones after integral promotion, so
// the final static_cast performs the actual truncation.
template <typename T>
HWY_INLINE Vec1<T> operator+(Vec1<T> a, Vec1<T> b) {
  const uint64_t a64 = static_cast<uint64_t>(a.raw);
  const uint64_t b64 = static_cast<uint64_t>(b.raw);
  return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))));
}
HWY_INLINE Vec1<float> operator+(const Vec1<float> a, const Vec1<float> b) {
  return Vec1<float>(a.raw + b.raw);
}
HWY_INLINE Vec1<double> operator+(const Vec1<double> a, const Vec1<double> b) {
  return Vec1<double>(a.raw + b.raw);
}

// Integer subtraction: same wrapping approach as operator+.
template <typename T>
HWY_INLINE Vec1<T> operator-(Vec1<T> a, Vec1<T> b) {
  const uint64_t a64 = static_cast<uint64_t>(a.raw);
  const uint64_t b64 = static_cast<uint64_t>(b.raw);
  return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))));
}
HWY_INLINE Vec1<float> operator-(const Vec1<float> a, const Vec1<float> b) {
  return Vec1<float>(a.raw - b.raw);
}
HWY_INLINE Vec1<double> operator-(const Vec1<double> a, const Vec1<double> b) {
  return Vec1<double>(a.raw - b.raw);
}
387 
388 // ------------------------------ Saturating addition
389 
390 // Returns a + b clamped to the destination range.
391 
392 // Unsigned
SaturatedAdd(const Vec1<uint8_t> a,const Vec1<uint8_t> b)393 HWY_INLINE Vec1<uint8_t> SaturatedAdd(const Vec1<uint8_t> a,
394                                       const Vec1<uint8_t> b) {
395   return Vec1<uint8_t>(
396       static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255)));
397 }
SaturatedAdd(const Vec1<uint16_t> a,const Vec1<uint16_t> b)398 HWY_INLINE Vec1<uint16_t> SaturatedAdd(const Vec1<uint16_t> a,
399                                        const Vec1<uint16_t> b) {
400   return Vec1<uint16_t>(
401       static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535)));
402 }
403 
404 // Signed
SaturatedAdd(const Vec1<int8_t> a,const Vec1<int8_t> b)405 HWY_INLINE Vec1<int8_t> SaturatedAdd(const Vec1<int8_t> a,
406                                      const Vec1<int8_t> b) {
407   return Vec1<int8_t>(
408       static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127)));
409 }
SaturatedAdd(const Vec1<int16_t> a,const Vec1<int16_t> b)410 HWY_INLINE Vec1<int16_t> SaturatedAdd(const Vec1<int16_t> a,
411                                       const Vec1<int16_t> b) {
412   return Vec1<int16_t>(
413       static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767)));
414 }
415 
416 // ------------------------------ Saturating subtraction
417 
418 // Returns a - b clamped to the destination range.
419 
420 // Unsigned
SaturatedSub(const Vec1<uint8_t> a,const Vec1<uint8_t> b)421 HWY_INLINE Vec1<uint8_t> SaturatedSub(const Vec1<uint8_t> a,
422                                       const Vec1<uint8_t> b) {
423   return Vec1<uint8_t>(
424       static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255)));
425 }
SaturatedSub(const Vec1<uint16_t> a,const Vec1<uint16_t> b)426 HWY_INLINE Vec1<uint16_t> SaturatedSub(const Vec1<uint16_t> a,
427                                        const Vec1<uint16_t> b) {
428   return Vec1<uint16_t>(
429       static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535)));
430 }
431 
432 // Signed
SaturatedSub(const Vec1<int8_t> a,const Vec1<int8_t> b)433 HWY_INLINE Vec1<int8_t> SaturatedSub(const Vec1<int8_t> a,
434                                      const Vec1<int8_t> b) {
435   return Vec1<int8_t>(
436       static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127)));
437 }
SaturatedSub(const Vec1<int16_t> a,const Vec1<int16_t> b)438 HWY_INLINE Vec1<int16_t> SaturatedSub(const Vec1<int16_t> a,
439                                       const Vec1<int16_t> b) {
440   return Vec1<int16_t>(
441       static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767)));
442 }
443 
444 // ------------------------------ Average
445 
446 // Returns (a + b + 1) / 2
447 
AverageRound(const Vec1<uint8_t> a,const Vec1<uint8_t> b)448 HWY_INLINE Vec1<uint8_t> AverageRound(const Vec1<uint8_t> a,
449                                       const Vec1<uint8_t> b) {
450   return Vec1<uint8_t>(static_cast<uint8_t>((a.raw + b.raw + 1) / 2));
451 }
AverageRound(const Vec1<uint16_t> a,const Vec1<uint16_t> b)452 HWY_INLINE Vec1<uint16_t> AverageRound(const Vec1<uint16_t> a,
453                                        const Vec1<uint16_t> b) {
454   return Vec1<uint16_t>(static_cast<uint16_t>((a.raw + b.raw + 1) / 2));
455 }
456 
// ------------------------------ Absolute value

template <typename T>
HWY_INLINE Vec1<T> Abs(const Vec1<T> a) {
  const T i = a.raw;
  // Negating LimitsMin<T> would overflow (UB for signed integers), so that
  // input is returned unchanged.
  return (i >= 0 || i == hwy::LimitsMin<T>()) ? a : Vec1<T>(-i);
}
HWY_INLINE Vec1<float> Abs(const Vec1<float> a) {
  return Vec1<float>(std::abs(a.raw));
}
HWY_INLINE Vec1<double> Abs(const Vec1<double> a) {
  return Vec1<double>(std::abs(a.raw));
}
470 
// ------------------------------ min/max

template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_INLINE Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(HWY_MIN(a.raw, b.raw));
}

// Floating-point: if exactly one input is NaN, returns the other (non-NaN)
// input rather than propagating the NaN.
template <typename T, HWY_IF_FLOAT(T)>
HWY_INLINE Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
  if (std::isnan(a.raw)) return b;
  if (std::isnan(b.raw)) return a;
  return Vec1<T>(HWY_MIN(a.raw, b.raw));
}

template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_INLINE Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(HWY_MAX(a.raw, b.raw));
}

// Floating-point: same NaN handling as Min above.
template <typename T, HWY_IF_FLOAT(T)>
HWY_INLINE Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
  if (std::isnan(a.raw)) return b;
  if (std::isnan(b.raw)) return a;
  return Vec1<T>(HWY_MAX(a.raw, b.raw));
}
496 
497 // ------------------------------ Floating-point negate
498 
499 template <typename T, HWY_IF_FLOAT(T)>
Neg(const Vec1<T> v)500 HWY_INLINE Vec1<T> Neg(const Vec1<T> v) {
501   return Xor(v, SignBit(Sisd<T>()));
502 }
503 
504 template <typename T, HWY_IF_NOT_FLOAT(T)>
Neg(const Vec1<T> v)505 HWY_INLINE Vec1<T> Neg(const Vec1<T> v) {
506   return Zero(Sisd<T>()) - v;
507 }
508 
// ------------------------------ mul/div

// The IsFloat/IsSigned branches are compile-time constants. Operands are
// widened (double / int64_t / uint64_t) before multiplying; note that for
// 64-bit T the widening is a no-op, so signed 64-bit overflow would still be
// UB — presumably callers avoid that (TODO confirm).
template <typename T>
HWY_INLINE Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
  if (hwy::IsFloat<T>()) {
    return Vec1<T>(static_cast<T>(double(a.raw) * b.raw));
  } else if (hwy::IsSigned<T>()) {
    return Vec1<T>(static_cast<T>(int64_t(a.raw) * b.raw));
  } else {
    return Vec1<T>(static_cast<T>(uint64_t(a.raw) * b.raw));
  }
}

// Plain division; integer division by zero is UB, callers must avoid it.
template <typename T>
HWY_INLINE Vec1<T> operator/(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(a.raw / b.raw);
}

// Returns the upper 16 bits of a * b in each lane.
HWY_INLINE Vec1<int16_t> MulHigh(const Vec1<int16_t> a, const Vec1<int16_t> b) {
  // int16 * int16 promotes to int, so the 32-bit product is exact.
  return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw) >> 16));
}
HWY_INLINE Vec1<uint16_t> MulHigh(const Vec1<uint16_t> a,
                                  const Vec1<uint16_t> b) {
  // Cast to uint32_t first to prevent overflow. Otherwise the result of
  // uint16_t * uint16_t is in "int" which may overflow. In practice the result
  // is the same but this way it is also defined.
  return Vec1<uint16_t>(static_cast<uint16_t>(
      (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
}
539 
540 // Multiplies even lanes (0, 2 ..) and returns the double-wide result.
MulEven(const Vec1<int32_t> a,const Vec1<int32_t> b)541 HWY_INLINE Vec1<int64_t> MulEven(const Vec1<int32_t> a, const Vec1<int32_t> b) {
542   const int64_t a64 = a.raw;
543   return Vec1<int64_t>(a64 * b.raw);
544 }
MulEven(const Vec1<uint32_t> a,const Vec1<uint32_t> b)545 HWY_INLINE Vec1<uint64_t> MulEven(const Vec1<uint32_t> a,
546                                   const Vec1<uint32_t> b) {
547   const uint64_t a64 = a.raw;
548   return Vec1<uint64_t>(a64 * b.raw);
549 }
550 
// Approximate reciprocal. In this scalar implementation it is actually an
// exact division, apart from the special-cased zero input.
HWY_INLINE Vec1<float> ApproximateReciprocal(const Vec1<float> v) {
  // Zero inputs are allowed, but callers are responsible for replacing the
  // return value with something else (typically using IfThenElse). This check
  // avoids a ubsan error. The return value is arbitrary.
  if (v.raw == 0.0f) return Vec1<float>(0.0f);
  return Vec1<float>(1.0f / v.raw);
}

// Absolute value of difference.
HWY_INLINE Vec1<float> AbsDiff(const Vec1<float> a, const Vec1<float> b) {
  return Abs(a - b);
}
564 
565 // ------------------------------ Floating-point multiply-add variants
566 
567 template <typename T>
MulAdd(const Vec1<T> mul,const Vec1<T> x,const Vec1<T> add)568 HWY_INLINE Vec1<T> MulAdd(const Vec1<T> mul, const Vec1<T> x,
569                           const Vec1<T> add) {
570   return mul * x + add;
571 }
572 
573 template <typename T>
NegMulAdd(const Vec1<T> mul,const Vec1<T> x,const Vec1<T> add)574 HWY_INLINE Vec1<T> NegMulAdd(const Vec1<T> mul, const Vec1<T> x,
575                              const Vec1<T> add) {
576   return add - mul * x;
577 }
578 
579 template <typename T>
MulSub(const Vec1<T> mul,const Vec1<T> x,const Vec1<T> sub)580 HWY_INLINE Vec1<T> MulSub(const Vec1<T> mul, const Vec1<T> x,
581                           const Vec1<T> sub) {
582   return mul * x - sub;
583 }
584 
585 template <typename T>
NegMulSub(const Vec1<T> mul,const Vec1<T> x,const Vec1<T> sub)586 HWY_INLINE Vec1<T> NegMulSub(const Vec1<T> mul, const Vec1<T> x,
587                              const Vec1<T> sub) {
588   return Neg(mul) * x - sub;
589 }
590 
// ------------------------------ Floating-point square root

// Approximate reciprocal square root, via the classic "fast inverse square
// root" bit trick plus one refinement step.
HWY_INLINE Vec1<float> ApproximateReciprocalSqrt(const Vec1<float> v) {
  float f = v.raw;
  const float half = f * 0.5f;
  uint32_t bits;
  CopyBytes<4>(&f, &bits);
  // Initial guess based on log2(f); 0x5F3759DF is the well-known magic
  // constant for this approximation.
  bits = 0x5F3759DF - (bits >> 1);
  CopyBytes<4>(&bits, &f);
  // One Newton-Raphson iteration
  return Vec1<float>(f * (1.5f - (half * f * f)));
}

// Square root
HWY_INLINE Vec1<float> Sqrt(const Vec1<float> v) {
  return Vec1<float>(std::sqrt(v.raw));
}
HWY_INLINE Vec1<double> Sqrt(const Vec1<double> v) {
  return Vec1<double>(std::sqrt(v.raw));
}
613 
// ------------------------------ Floating-point rounding

// Rounds to the nearest integer, ties to even ("banker's rounding").
template <typename T>
HWY_INLINE Vec1<T> Round(const Vec1<T> v) {
  using TI = MakeSigned<T>;
  // Values with magnitude >= MantissaEnd are already integral; the negated
  // comparison also catches NaN (all comparisons with NaN are false).
  if (!(Abs(v).raw < MantissaEnd<T>())) {  // Huge or NaN
    return v;
  }
  // Round half away from zero first; exact ties are corrected below.
  const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
  const TI rounded = static_cast<TI>(v.raw + bias);
  // Preserve the sign of zero: inputs in (-0.5, 0) must yield -0.0.
  if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
  // Round to even: if the input was exactly halfway and the biased result is
  // odd, step one unit back toward zero.
  if ((rounded & 1) && std::abs(rounded - v.raw) == T(0.5)) {
    return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
  }
  return Vec1<T>(static_cast<T>(rounded));
}
631 
// Round-to-nearest even, converting to int32_t. Out-of-range inputs saturate
// to LimitsMin/LimitsMax depending on sign.
HWY_INLINE Vec1<int32_t> NearestInt(const Vec1<float> v) {
  using T = float;
  using TI = int32_t;

  const T abs = Abs(v).raw;
  const bool signbit = std::signbit(v.raw);

  // Negated comparison also catches NaN.
  if (!(abs < MantissaEnd<T>())) {  // Huge or NaN
    // Check if too large to cast or NaN
    if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
      return Vec1<TI>(signbit ? LimitsMin<TI>() : LimitsMax<TI>());
    }
    // Large but representable: already integral, convert directly.
    return Vec1<int32_t>(static_cast<TI>(v.raw));
  }
  // Round half away from zero first; exact ties are corrected below.
  const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
  const TI rounded = static_cast<TI>(v.raw + bias);
  if (rounded == 0) return Vec1<int32_t>(0);
  // Round to even: on an exact tie with an odd result, step toward zero.
  if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
    return Vec1<TI>(rounded - (signbit ? -1 : 1));
  }
  return Vec1<TI>(rounded);
}
656 
// Rounds toward zero (truncation).
template <typename T>
HWY_INLINE Vec1<T> Trunc(const Vec1<T> v) {
  using TI = MakeSigned<T>;
  // Large magnitudes are already integral; negated comparison catches NaN.
  if (!(Abs(v).raw <= MantissaEnd<T>())) {  // Huge or NaN
    return v;
  }
  const TI truncated = static_cast<TI>(v.raw);
  // Preserve the sign of zero for inputs in (-1, 0).
  if (truncated == 0) return CopySignToAbs(Vec1<T>(0), v);
  return Vec1<T>(static_cast<T>(truncated));
}
667 
// Shared helper: rounds toward +infinity by direct IEEE-754 bit manipulation.
// Float: the floating-point type; Bits: same-size unsigned integer type;
// kMantissaBits/kExponentBits: field widths of the format.
template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
          class V>
V Ceiling(const V v) {
  const Bits kExponentMask = (1ull << kExponentBits) - 1;
  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
  const Bits kBias = kExponentMask / 2;

  Float f = v.raw;
  const bool positive = f > Float(0.0);

  Bits bits;
  // NOTE(review): copies from &v rather than &v.raw; relies on raw being the
  // sole member of V so the addresses coincide — confirm if V ever grows.
  CopyBytes<sizeof(Bits)>(&v, &bits);

  // Unbiased exponent; negative for |v| < 1.
  const int exponent =
      static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
  // Already an integer.
  if (exponent >= kMantissaBits) return v;
  // |v| <= 1 => 0 or 1.
  if (exponent < 0) return positive ? V(1) : V(-0.0);

  // Mask of the mantissa bits that represent the fractional part.
  const Bits mantissa_mask = kMantissaMask >> exponent;
  // Already an integer
  if ((bits & mantissa_mask) == 0) return v;

  // Clear fractional bits and round up
  if (positive) bits += (kMantissaMask + 1) >> exponent;
  bits &= ~mantissa_mask;

  CopyBytes<sizeof(Bits)>(&bits, &f);
  return V(f);
}
699 
// Shared helper: rounds toward -infinity by direct IEEE-754 bit manipulation.
// Template parameters as for Ceiling.
template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
          class V>
V Floor(const V v) {
  const Bits kExponentMask = (1ull << kExponentBits) - 1;
  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
  const Bits kBias = kExponentMask / 2;

  Float f = v.raw;
  const bool negative = f < Float(0.0);

  Bits bits;
  // NOTE(review): copies from &v rather than &v.raw; relies on raw being the
  // sole member of V so the addresses coincide — confirm if V ever grows.
  CopyBytes<sizeof(Bits)>(&v, &bits);

  // Unbiased exponent; negative for |v| < 1.
  const int exponent =
      static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
  // Already an integer.
  if (exponent >= kMantissaBits) return v;
  // |v| <= 1 => -1 or 0.
  if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0));

  // Mask of the mantissa bits that represent the fractional part.
  const Bits mantissa_mask = kMantissaMask >> exponent;
  // Already an integer
  if ((bits & mantissa_mask) == 0) return v;

  // Clear fractional bits and round down (increasing |v| for negatives).
  if (negative) bits += (kMantissaMask + 1) >> exponent;
  bits &= ~mantissa_mask;

  CopyBytes<sizeof(Bits)>(&bits, &f);
  return V(f);
}
731 
// Toward +infinity, aka ceiling. Template args: Float type, same-size
// unsigned bits type, mantissa bits, exponent bits.
HWY_INLINE Vec1<float> Ceil(const Vec1<float> v) {
  return Ceiling<float, uint32_t, 23, 8>(v);
}
HWY_INLINE Vec1<double> Ceil(const Vec1<double> v) {
  return Ceiling<double, uint64_t, 52, 11>(v);
}

// Toward -infinity, aka floor
HWY_INLINE Vec1<float> Floor(const Vec1<float> v) {
  return Floor<float, uint32_t, 23, 8>(v);
}
HWY_INLINE Vec1<double> Floor(const Vec1<double> v) {
  return Floor<double, uint64_t, 52, 11>(v);
}
747 
748 // ================================================== COMPARE
749 
750 template <typename T>
751 HWY_INLINE Mask1<T> operator==(const Vec1<T> a, const Vec1<T> b) {
752   return Mask1<T>::FromBool(a.raw == b.raw);
753 }
754 
755 template <typename T>
TestBit(const Vec1<T> v,const Vec1<T> bit)756 HWY_INLINE Mask1<T> TestBit(const Vec1<T> v, const Vec1<T> bit) {
757   static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
758   return (v & bit) == bit;
759 }
760 
761 template <typename T>
762 HWY_INLINE Mask1<T> operator<(const Vec1<T> a, const Vec1<T> b) {
763   return Mask1<T>::FromBool(a.raw < b.raw);
764 }
765 template <typename T>
766 HWY_INLINE Mask1<T> operator>(const Vec1<T> a, const Vec1<T> b) {
767   return Mask1<T>::FromBool(a.raw > b.raw);
768 }
769 
770 template <typename T>
771 HWY_INLINE Mask1<T> operator<=(const Vec1<T> a, const Vec1<T> b) {
772   return Mask1<T>::FromBool(a.raw <= b.raw);
773 }
774 template <typename T>
775 HWY_INLINE Mask1<T> operator>=(const Vec1<T> a, const Vec1<T> b) {
776   return Mask1<T>::FromBool(a.raw >= b.raw);
777 }
778 
779 // ================================================== MEMORY
780 
781 // ------------------------------ Load
782 
783 template <typename T>
Load(Sisd<T>,const T * HWY_RESTRICT aligned)784 HWY_INLINE Vec1<T> Load(Sisd<T> /* tag */, const T* HWY_RESTRICT aligned) {
785   T t;
786   CopyBytes<sizeof(T)>(aligned, &t);
787   return Vec1<T>(t);
788 }
789 
790 template <typename T>
LoadU(Sisd<T> d,const T * HWY_RESTRICT p)791 HWY_INLINE Vec1<T> LoadU(Sisd<T> d, const T* HWY_RESTRICT p) {
792   return Load(d, p);
793 }
794 
795 // In some use cases, "load single lane" is sufficient; otherwise avoid this.
796 template <typename T>
LoadDup128(Sisd<T> d,const T * HWY_RESTRICT aligned)797 HWY_INLINE Vec1<T> LoadDup128(Sisd<T> d, const T* HWY_RESTRICT aligned) {
798   return Load(d, aligned);
799 }
800 
801 // ------------------------------ Store
802 
803 template <typename T>
Store(const Vec1<T> v,Sisd<T>,T * HWY_RESTRICT aligned)804 HWY_INLINE void Store(const Vec1<T> v, Sisd<T> /* tag */,
805                       T* HWY_RESTRICT aligned) {
806   CopyBytes<sizeof(T)>(&v.raw, aligned);
807 }
808 
809 template <typename T>
StoreU(const Vec1<T> v,Sisd<T> d,T * HWY_RESTRICT p)810 HWY_INLINE void StoreU(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT p) {
811   return Store(v, d, p);
812 }
813 
814 // ------------------------------ StoreInterleaved3
815 
StoreInterleaved3(const Vec1<uint8_t> v0,const Vec1<uint8_t> v1,const Vec1<uint8_t> v2,Sisd<uint8_t> d,uint8_t * HWY_RESTRICT unaligned)816 HWY_API void StoreInterleaved3(const Vec1<uint8_t> v0, const Vec1<uint8_t> v1,
817                                const Vec1<uint8_t> v2, Sisd<uint8_t> d,
818                                uint8_t* HWY_RESTRICT unaligned) {
819   StoreU(v0, d, unaligned + 0);
820   StoreU(v1, d, unaligned + 1);
821   StoreU(v2, d, unaligned + 2);
822 }
823 
StoreInterleaved4(const Vec1<uint8_t> v0,const Vec1<uint8_t> v1,const Vec1<uint8_t> v2,const Vec1<uint8_t> v3,Sisd<uint8_t> d,uint8_t * HWY_RESTRICT unaligned)824 HWY_API void StoreInterleaved4(const Vec1<uint8_t> v0, const Vec1<uint8_t> v1,
825                                const Vec1<uint8_t> v2, const Vec1<uint8_t> v3,
826                                Sisd<uint8_t> d,
827                                uint8_t* HWY_RESTRICT unaligned) {
828   StoreU(v0, d, unaligned + 0);
829   StoreU(v1, d, unaligned + 1);
830   StoreU(v2, d, unaligned + 2);
831   StoreU(v3, d, unaligned + 3);
832 }
833 
834 // ------------------------------ Stream
835 
836 template <typename T>
Stream(const Vec1<T> v,Sisd<T> d,T * HWY_RESTRICT aligned)837 HWY_INLINE void Stream(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT aligned) {
838   return Store(v, d, aligned);
839 }
840 
841 // ------------------------------ Scatter
842 
843 template <typename T, typename Offset>
ScatterOffset(Vec1<T> v,Sisd<T> d,T * base,const Vec1<Offset> offset)844 HWY_INLINE void ScatterOffset(Vec1<T> v, Sisd<T> d, T* base,
845                               const Vec1<Offset> offset) {
846   static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
847   uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw;
848   return Store(v, d, reinterpret_cast<T*>(base8));
849 }
850 
851 template <typename T, typename Index>
ScatterIndex(Vec1<T> v,Sisd<T> d,T * HWY_RESTRICT base,const Vec1<Index> index)852 HWY_INLINE void ScatterIndex(Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT base,
853                              const Vec1<Index> index) {
854   static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
855   return Store(v, d, base + index.raw);
856 }
857 
858 // ------------------------------ Gather
859 
860 template <typename T, typename Offset>
GatherOffset(Sisd<T> d,const T * base,const Vec1<Offset> offset)861 HWY_INLINE Vec1<T> GatherOffset(Sisd<T> d, const T* base,
862                                 const Vec1<Offset> offset) {
863   static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
864   const uintptr_t addr = reinterpret_cast<uintptr_t>(base) + offset.raw;
865   return Load(d, reinterpret_cast<const T*>(addr));
866 }
867 
868 template <typename T, typename Index>
GatherIndex(Sisd<T> d,const T * HWY_RESTRICT base,const Vec1<Index> index)869 HWY_INLINE Vec1<T> GatherIndex(Sisd<T> d, const T* HWY_RESTRICT base,
870                                const Vec1<Index> index) {
871   static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
872   return Load(d, base + index.raw);
873 }
874 
875 // ================================================== CONVERT
876 
877 // ConvertTo and DemoteTo with floating-point input and integer output truncate
878 // (rounding toward zero).
879 
880 template <typename FromT, typename ToT>
PromoteTo(Sisd<ToT>,Vec1<FromT> from)881 HWY_INLINE Vec1<ToT> PromoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
882   static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting");
883   // For bits Y > X, floatX->floatY and intX->intY are always representable.
884   return Vec1<ToT>(static_cast<ToT>(from.raw));
885 }
886 
// Narrowing conversion from a floating-point input. Out-of-range values
// (including +/- infinity) are saturated to the target's bounds, which
// prevents ubsan errors from converting float to a narrower integer/float.
// Rounds toward zero for integer ToT (see file-level comment above).
template <typename FromT, typename ToT, HWY_IF_FLOAT(FromT)>
HWY_INLINE Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
  static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");

  // Prevent ubsan errors when converting float to narrower integer/float.
  // signbit preserves the input's sign when clamping.
  if (std::isinf(from.raw) ||
      std::fabs(from.raw) > static_cast<FromT>(HighestValue<ToT>())) {
    return Vec1<ToT>(std::signbit(from.raw) ? LowestValue<ToT>()
                                            : HighestValue<ToT>());
  }
  return Vec1<ToT>(static_cast<ToT>(from.raw));
}
899 
900 template <typename FromT, typename ToT, HWY_IF_NOT_FLOAT(FromT)>
DemoteTo(Sisd<ToT>,Vec1<FromT> from)901 HWY_INLINE Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
902   static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");
903 
904   // Int to int: choose closest value in ToT to `from` (avoids UB)
905   from.raw = std::min<FromT>(std::max<FromT>(LimitsMin<ToT>(), from.raw),
906                              LimitsMax<ToT>());
907   return Vec1<ToT>(static_cast<ToT>(from.raw));
908 }
909 
// Expands an IEEE binary16 value to binary32 by direct bit manipulation.
// NOTE(review): biased_exp == 31 (inf/NaN) is not special-cased here and
// falls through to the normalized path, producing a large finite value --
// confirm callers never pass inf/NaN.
static HWY_INLINE Vec1<float> PromoteTo(Sisd<float> /* tag */,
                                        const Vec1<float16_t> v) {
#if HWY_NATIVE_FLOAT16
  uint16_t bits16;
  CopyBytes<2>(&v.raw, &bits16);
#else
  const uint16_t bits16 = v.raw.bits;
#endif
  // Decompose into sign (1 bit), biased exponent (5 bits), mantissa (10 bits).
  const uint32_t sign = bits16 >> 15;
  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
  const uint32_t mantissa = bits16 & 0x3FF;

  // Subnormal or zero: value = mantissa * 2^-10 * 2^-14.
  if (biased_exp == 0) {
    const float subnormal =
        (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
    return Vec1<float>(sign ? -subnormal : subnormal);
  }

  // Normalized: convert the representation directly (faster than ldexp/tables).
  // Rebias the exponent (15 -> 127) and left-align the mantissa (10 -> 23).
  const uint32_t biased_exp32 = biased_exp + (127 - 15);
  const uint32_t mantissa32 = mantissa << (23 - 10);
  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
  float out;
  CopyBytes<4>(&bits32, &out);
  return Vec1<float>(out);
}
937 
// Narrows binary32 to IEEE binary16 by bit manipulation. The mantissa is
// truncated, not rounded-to-nearest.
// NOTE(review): inf/NaN inputs (biased_exp32 == 255) are clamped by the
// HWY_MIN below to the largest finite exponent rather than mapped to f16
// inf/NaN -- confirm this is intended.
static HWY_INLINE Vec1<float16_t> DemoteTo(Sisd<float16_t> /* tag */,
                                           const Vec1<float> v) {
  uint32_t bits32;
  CopyBytes<4>(&v.raw, &bits32);
  // Decompose into sign (1 bit), biased exponent (8 bits), mantissa (23 bits).
  const uint32_t sign = bits32 >> 31;
  const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
  const uint32_t mantissa32 = bits32 & 0x7FFFFF;

  // Unbiased exponent, clamped to the maximum representable in f16.
  const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);

  // Tiny or zero => zero.
  Vec1<float16_t> out;
  if (exp < -24) {
#if HWY_NATIVE_FLOAT16
    const uint16_t zero = 0;
    CopyBytes<2>(&zero, &out.raw);
#else
    out.raw.bits = 0;
#endif
    return out;
  }

  uint32_t biased_exp16, mantissa16;

  // exp = [-24, -15] => subnormal
  if (exp < -14) {
    biased_exp16 = 0;
    const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
    HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
    // The implicit leading 1 becomes an explicit mantissa bit; the remaining
    // mantissa bits shift right accordingly.
    mantissa16 = (1 << (10 - sub_exp)) + (mantissa32 >> (13 + sub_exp));
  } else {
    // exp = [-14, 15]: normalized. Rebias (127 -> 15), truncate mantissa
    // from 23 to 10 bits.
    biased_exp16 = static_cast<uint32_t>(exp + 15);
    HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
    mantissa16 = mantissa32 >> 13;
  }

  HWY_DASSERT(mantissa16 < 1024);
  // Reassemble sign | exponent | mantissa into the 16-bit representation.
  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
  HWY_DASSERT(bits16 < 0x10000);
#if HWY_NATIVE_FLOAT16
  const uint16_t narrowed = static_cast<uint16_t>(bits16);  // big-endian safe
  CopyBytes<2>(&narrowed, &out.raw);
#else
  out.raw.bits = static_cast<uint16_t>(bits16);
#endif
  return out;
}
986 
987 template <typename FromT, typename ToT, HWY_IF_FLOAT(FromT)>
ConvertTo(Sisd<ToT>,Vec1<FromT> from)988 HWY_INLINE Vec1<ToT> ConvertTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
989   static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
990   // float## -> int##: return closest representable value. We cannot exactly
991   // represent LimitsMax<ToT> in FromT, so use double.
992   const double f = static_cast<double>(from.raw);
993   if (std::isinf(from.raw) ||
994       std::fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
995     return Vec1<ToT>(std::signbit(from.raw) ? LimitsMin<ToT>()
996                                             : LimitsMax<ToT>());
997   }
998   return Vec1<ToT>(static_cast<ToT>(from.raw));
999 }
1000 
1001 template <typename FromT, typename ToT, HWY_IF_NOT_FLOAT(FromT)>
ConvertTo(Sisd<ToT>,Vec1<FromT> from)1002 HWY_INLINE Vec1<ToT> ConvertTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
1003   static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
1004   // int## -> float##: no check needed
1005   return Vec1<ToT>(static_cast<ToT>(from.raw));
1006 }
1007 
U8FromU32(const Vec1<uint32_t> v)1008 HWY_INLINE Vec1<uint8_t> U8FromU32(const Vec1<uint32_t> v) {
1009   return DemoteTo(Sisd<uint8_t>(), v);
1010 }
1011 
1012 // ================================================== SWIZZLE
1013 
1014 // Unsupported: Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle*,
1015 // UpperHalf - these require more than one lane and/or actual 128-bit vectors.
1016 
1017 template <typename T>
GetLane(const Vec1<T> v)1018 HWY_INLINE T GetLane(const Vec1<T> v) {
1019   return v.raw;
1020 }
1021 
1022 template <typename T>
LowerHalf(Vec1<T> v)1023 HWY_INLINE Vec1<T> LowerHalf(Vec1<T> v) {
1024   return v;
1025 }
1026 
1027 // ------------------------------ Broadcast/splat any lane
1028 
1029 template <int kLane, typename T>
Broadcast(const Vec1<T> v)1030 HWY_INLINE Vec1<T> Broadcast(const Vec1<T> v) {
1031   static_assert(kLane == 0, "Scalar only has one lane");
1032   return v;
1033 }
1034 
1035 // ------------------------------ Shuffle bytes with variable indices
1036 
1037 // Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e.
1038 // indices in [0, sizeof(T)).
1039 template <typename T>
TableLookupBytes(const Vec1<T> in,const Vec1<T> from)1040 HWY_API Vec1<T> TableLookupBytes(const Vec1<T> in, const Vec1<T> from) {
1041   uint8_t in_bytes[sizeof(T)];
1042   uint8_t from_bytes[sizeof(T)];
1043   uint8_t out_bytes[sizeof(T)];
1044   CopyBytes<sizeof(T)>(&in, &in_bytes);
1045   CopyBytes<sizeof(T)>(&from, &from_bytes);
1046   for (size_t i = 0; i < sizeof(T); ++i) {
1047     out_bytes[i] = in_bytes[from_bytes[i]];
1048   }
1049   T out;
1050   CopyBytes<sizeof(T)>(&out_bytes, &out);
1051   return Vec1<T>{out};
1052 }
1053 
1054 // ------------------------------ TableLookupLanes
1055 
1056 // Returned by SetTableIndices for use by TableLookupLanes.
template <typename T>
struct Indices1 {
  // Lane index; the only valid value for the single-lane target is 0
  // (enforced by SetTableIndices in debug builds).
  int raw;
};
1061 
1062 template <typename T>
SetTableIndices(Sisd<T>,const int32_t * idx)1063 HWY_API Indices1<T> SetTableIndices(Sisd<T>, const int32_t* idx) {
1064 #if !defined(NDEBUG) || defined(ADDRESS_SANITIZER)
1065   HWY_DASSERT(idx[0] == 0);
1066 #endif
1067   return Indices1<T>{idx[0]};
1068 }
1069 
1070 template <typename T>
TableLookupLanes(const Vec1<T> v,const Indices1<T>)1071 HWY_API Vec1<T> TableLookupLanes(const Vec1<T> v, const Indices1<T> /* idx */) {
1072   return v;
1073 }
1074 
1075 // ------------------------------ Zip/unpack
1076 
ZipLower(const Vec1<uint8_t> a,const Vec1<uint8_t> b)1077 HWY_INLINE Vec1<uint16_t> ZipLower(const Vec1<uint8_t> a,
1078                                    const Vec1<uint8_t> b) {
1079   return Vec1<uint16_t>(static_cast<uint16_t>((uint32_t(b.raw) << 8) + a.raw));
1080 }
ZipLower(const Vec1<uint16_t> a,const Vec1<uint16_t> b)1081 HWY_INLINE Vec1<uint32_t> ZipLower(const Vec1<uint16_t> a,
1082                                    const Vec1<uint16_t> b) {
1083   return Vec1<uint32_t>((uint32_t(b.raw) << 16) + a.raw);
1084 }
ZipLower(const Vec1<uint32_t> a,const Vec1<uint32_t> b)1085 HWY_INLINE Vec1<uint64_t> ZipLower(const Vec1<uint32_t> a,
1086                                    const Vec1<uint32_t> b) {
1087   return Vec1<uint64_t>((uint64_t(b.raw) << 32) + a.raw);
1088 }
ZipLower(const Vec1<int8_t> a,const Vec1<int8_t> b)1089 HWY_INLINE Vec1<int16_t> ZipLower(const Vec1<int8_t> a, const Vec1<int8_t> b) {
1090   return Vec1<int16_t>(static_cast<int16_t>((int32_t(b.raw) << 8) + a.raw));
1091 }
ZipLower(const Vec1<int16_t> a,const Vec1<int16_t> b)1092 HWY_INLINE Vec1<int32_t> ZipLower(const Vec1<int16_t> a,
1093                                   const Vec1<int16_t> b) {
1094   return Vec1<int32_t>((int32_t(b.raw) << 16) + a.raw);
1095 }
ZipLower(const Vec1<int32_t> a,const Vec1<int32_t> b)1096 HWY_INLINE Vec1<int64_t> ZipLower(const Vec1<int32_t> a,
1097                                   const Vec1<int32_t> b) {
1098   return Vec1<int64_t>((int64_t(b.raw) << 32) + a.raw);
1099 }
1100 
1101 // ------------------------------ Mask
1102 
1103 template <typename T>
AllFalse(const Mask1<T> mask)1104 HWY_INLINE bool AllFalse(const Mask1<T> mask) {
1105   return mask.bits == 0;
1106 }
1107 
1108 template <typename T>
AllTrue(const Mask1<T> mask)1109 HWY_INLINE bool AllTrue(const Mask1<T> mask) {
1110   return mask.bits != 0;
1111 }
1112 
1113 template <typename T>
StoreMaskBits(const Mask1<T> mask,uint8_t * p)1114 HWY_INLINE size_t StoreMaskBits(const Mask1<T> mask, uint8_t* p) {
1115   *p = AllTrue(mask);
1116   return 1;
1117 }
1118 template <typename T>
CountTrue(const Mask1<T> mask)1119 HWY_INLINE size_t CountTrue(const Mask1<T> mask) {
1120   return mask.bits == 0 ? 0 : 1;
1121 }
1122 
1123 template <typename T>
Compress(Vec1<T> v,const Mask1<T>)1124 HWY_API Vec1<T> Compress(Vec1<T> v, const Mask1<T> /* mask */) {
1125   // Upper lanes are undefined, so result is the same independent of mask.
1126   return v;
1127 }
1128 
1129 // ------------------------------ CompressStore
1130 
1131 template <typename T>
CompressStore(Vec1<T> v,const Mask1<T> mask,Sisd<T> d,T * HWY_RESTRICT aligned)1132 HWY_API size_t CompressStore(Vec1<T> v, const Mask1<T> mask, Sisd<T> d,
1133                              T* HWY_RESTRICT aligned) {
1134   Store(Compress(v, mask), d, aligned);
1135   return CountTrue(mask);
1136 }
1137 
1138 // ------------------------------ Reductions
1139 
1140 // Sum of all lanes, i.e. the only one.
1141 template <typename T>
SumOfLanes(const Vec1<T> v0)1142 HWY_INLINE Vec1<T> SumOfLanes(const Vec1<T> v0) {
1143   return v0;
1144 }
1145 template <typename T>
MinOfLanes(const Vec1<T> v)1146 HWY_INLINE Vec1<T> MinOfLanes(const Vec1<T> v) {
1147   return v;
1148 }
1149 template <typename T>
MaxOfLanes(const Vec1<T> v)1150 HWY_INLINE Vec1<T> MaxOfLanes(const Vec1<T> v) {
1151   return v;
1152 }
1153 
1154 // ================================================== Operator wrapper
1155 
1156 template <class V>
Add(V a,V b)1157 HWY_API V Add(V a, V b) {
1158   return a + b;
1159 }
1160 template <class V>
Sub(V a,V b)1161 HWY_API V Sub(V a, V b) {
1162   return a - b;
1163 }
1164 
1165 template <class V>
Mul(V a,V b)1166 HWY_API V Mul(V a, V b) {
1167   return a * b;
1168 }
1169 template <class V>
Div(V a,V b)1170 HWY_API V Div(V a, V b) {
1171   return a / b;
1172 }
1173 
1174 template <class V>
Shl(V a,V b)1175 V Shl(V a, V b) {
1176   return a << b;
1177 }
1178 template <class V>
Shr(V a,V b)1179 V Shr(V a, V b) {
1180   return a >> b;
1181 }
1182 
1183 template <class V>
1184 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
1185   return a == b;
1186 }
1187 template <class V>
1188 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
1189   return a < b;
1190 }
1191 
1192 template <class V>
1193 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
1194   return a > b;
1195 }
1196 template <class V>
1197 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
1198   return a >= b;
1199 }
1200 
1201 template <class V>
1202 HWY_API auto Le(V a, V b) -> decltype(a == b) {
1203   return a <= b;
1204 }
1205 
1206 // NOLINTNEXTLINE(google-readability-namespace-comments)
1207 }  // namespace HWY_NAMESPACE
1208 }  // namespace hwy
1209 HWY_AFTER_NAMESPACE();
1210