1 // Copyright 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 // Single-element vectors and operations.
16 // External include guard in highway.h - see comment there.
17
18 #include <stddef.h>
19 #include <stdint.h>
20
21 #include <algorithm> // std::min
22
23 #include "hwy/base.h"
24 #include "hwy/ops/shared-inl.h"
25
26 HWY_BEFORE_NAMESPACE();
27 namespace hwy {
28 namespace HWY_NAMESPACE {
29
// Single instruction, single data: descriptor tag for a "vector" with exactly
// one lane, used to emulate SIMD ops with plain scalar code.
template <typename T>
using Sisd = Simd<T, 1>;
33
// (Wrapper class required for overloading comparison operators.)
// Holds a single lane of type T; all SIMD ops below operate on this one lane.
template <typename T>
struct Vec1 {
  HWY_INLINE Vec1() = default;
  Vec1(const Vec1&) = default;
  Vec1& operator=(const Vec1&) = default;
  HWY_INLINE explicit Vec1(const T t) : raw(t) {}

  // Compound assignment operators forward to the corresponding free binary
  // operators defined later in this file.
  HWY_INLINE Vec1& operator*=(const Vec1 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec1& operator/=(const Vec1 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec1& operator+=(const Vec1 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec1& operator-=(const Vec1 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec1& operator&=(const Vec1 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec1& operator|=(const Vec1 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec1& operator^=(const Vec1 other) {
    return *this = (*this ^ other);
  }

  // The single lane's value.
  T raw;
};
66
// 0 or FF..FF, same size as Vec1.
// Mirrors SIMD mask encoding: all bits set means "true", zero means "false".
template <typename T>
class Mask1 {
  using Raw = hwy::MakeUnsigned<T>;

 public:
  // Converts a bool into the all-ones/all-zeros mask representation.
  static HWY_INLINE Mask1<T> FromBool(bool b) {
    Mask1<T> mask;
    mask.bits = b ? ~Raw(0) : 0;
    return mask;
  }

  Raw bits;
};
81
82 // ------------------------------ BitCast
83
84 template <typename T, typename FromT>
BitCast(Sisd<T>,Vec1<FromT> v)85 HWY_INLINE Vec1<T> BitCast(Sisd<T> /* tag */, Vec1<FromT> v) {
86 static_assert(sizeof(T) <= sizeof(FromT), "Promoting is undefined");
87 T to;
88 CopyBytes<sizeof(FromT)>(&v.raw, &to);
89 return Vec1<T>(to);
90 }
91
// ------------------------------ Set

// Returns a lane initialized to zero.
template <typename T>
HWY_INLINE Vec1<T> Zero(Sisd<T> /* tag */) {
  return Vec1<T>(T(0));
}

// Broadcasts t (possibly of a different arithmetic type T2) into the lane.
template <typename T, typename T2>
HWY_INLINE Vec1<T> Set(Sisd<T> /* tag */, const T2 t) {
  return Vec1<T>(static_cast<T>(t));
}

// Returns zero rather than an uninitialized value (avoids UB from reading
// indeterminate memory); callers must not rely on the value.
template <typename T>
HWY_INLINE Vec1<T> Undefined(Sisd<T> d) {
  return Zero(d);
}

// first, first + 1, ...: with a single lane this is simply `first`.
template <typename T, typename T2>
Vec1<T> Iota(const Sisd<T> /* tag */, const T2 first) {
  return Vec1<T>(static_cast<T>(first));
}
113
// ================================================== LOGICAL

// ------------------------------ Not

// Bitwise complement. Floating-point inputs are bit-cast to unsigned so the
// operation is defined on their representation.
template <typename T>
HWY_INLINE Vec1<T> Not(const Vec1<T> v) {
  using TU = MakeUnsigned<T>;
  const Sisd<TU> du;
  return BitCast(Sisd<T>(), Vec1<TU>(~BitCast(du, v).raw));
}
124
125 // ------------------------------ And
126
127 template <typename T>
And(const Vec1<T> a,const Vec1<T> b)128 HWY_INLINE Vec1<T> And(const Vec1<T> a, const Vec1<T> b) {
129 using TU = MakeUnsigned<T>;
130 const Sisd<TU> du;
131 return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw & BitCast(du, b).raw));
132 }
133 template <typename T>
134 HWY_INLINE Vec1<T> operator&(const Vec1<T> a, const Vec1<T> b) {
135 return And(a, b);
136 }
137
138 // ------------------------------ AndNot
139
140 template <typename T>
AndNot(const Vec1<T> a,const Vec1<T> b)141 HWY_INLINE Vec1<T> AndNot(const Vec1<T> a, const Vec1<T> b) {
142 using TU = MakeUnsigned<T>;
143 const Sisd<TU> du;
144 return BitCast(Sisd<T>(), Vec1<TU>(~BitCast(du, a).raw & BitCast(du, b).raw));
145 }
146
147 // ------------------------------ Or
148
149 template <typename T>
Or(const Vec1<T> a,const Vec1<T> b)150 HWY_INLINE Vec1<T> Or(const Vec1<T> a, const Vec1<T> b) {
151 using TU = MakeUnsigned<T>;
152 const Sisd<TU> du;
153 return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw | BitCast(du, b).raw));
154 }
155 template <typename T>
156 HWY_INLINE Vec1<T> operator|(const Vec1<T> a, const Vec1<T> b) {
157 return Or(a, b);
158 }
159
160 // ------------------------------ Xor
161
162 template <typename T>
Xor(const Vec1<T> a,const Vec1<T> b)163 HWY_INLINE Vec1<T> Xor(const Vec1<T> a, const Vec1<T> b) {
164 using TU = MakeUnsigned<T>;
165 const Sisd<TU> du;
166 return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw ^ BitCast(du, b).raw));
167 }
168 template <typename T>
169 HWY_INLINE Vec1<T> operator^(const Vec1<T> a, const Vec1<T> b) {
170 return Xor(a, b);
171 }
172
// ------------------------------ CopySign

// Returns a value with the magnitude of `magn` and the sign bit of `sign`.
template <typename T>
HWY_API Vec1<T> CopySign(const Vec1<T> magn, const Vec1<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const auto msb = SignBit(Sisd<T>());
  // Clear magn's sign bit, then OR in sign's sign bit.
  return Or(AndNot(msb, magn), And(msb, sign));
}

// Like CopySign, but requires that `abs` already has a clear sign bit, which
// saves the AndNot.
template <typename T>
HWY_API Vec1<T> CopySignToAbs(const Vec1<T> abs, const Vec1<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  return Or(abs, And(SignBit(Sisd<T>()), sign));
}

// ------------------------------ BroadcastSignBit

// Returns all-ones for negative (signed integer) inputs, else zero.
template <typename T>
HWY_API Vec1<T> BroadcastSignBit(const Vec1<T> v) {
  // This is used inside ShiftRight, so we cannot implement in terms of it.
  return v.raw < 0 ? Vec1<T>(T(-1)) : Vec1<T>(0);
}
195
// ------------------------------ Mask

// Reinterprets a mask for lanes of TFrom as a mask for same-sized TTo lanes.
template <typename TFrom, typename TTo>
HWY_API Mask1<TTo> RebindMask(Sisd<TTo> /*tag*/, Mask1<TFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  return Mask1<TTo>{m.bits};
}

// v must be 0 or FF..FF.
// Copies the lane's bit pattern into the mask (no comparison is performed).
template <typename T>
HWY_INLINE Mask1<T> MaskFromVec(const Vec1<T> v) {
  Mask1<T> mask;
  CopyBytes<sizeof(mask.bits)>(&v.raw, &mask.bits);
  return mask;
}

// Copies the mask's bit pattern into a lane (0 or FF..FF).
template <typename T>
Vec1<T> VecFromMask(const Mask1<T> mask) {
  Vec1<T> v;
  CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
  return v;
}

// Tagged overload of the above; same behavior.
template <typename T>
Vec1<T> VecFromMask(Sisd<T> /* tag */, const Mask1<T> mask) {
  Vec1<T> v;
  CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
  return v;
}

// Mask with the first n lanes set: with one lane, true iff n != 0.
template <typename T>
HWY_INLINE Mask1<T> FirstN(Sisd<T> /*tag*/, size_t n) {
  return Mask1<T>::FromBool(n != 0);
}
230
231 // Returns mask ? yes : no.
232 template <typename T>
IfThenElse(const Mask1<T> mask,const Vec1<T> yes,const Vec1<T> no)233 HWY_INLINE Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes,
234 const Vec1<T> no) {
235 return mask.bits ? yes : no;
236 }
237
238 template <typename T>
IfThenElseZero(const Mask1<T> mask,const Vec1<T> yes)239 HWY_INLINE Vec1<T> IfThenElseZero(const Mask1<T> mask, const Vec1<T> yes) {
240 return mask.bits ? yes : Vec1<T>(0);
241 }
242
243 template <typename T>
IfThenZeroElse(const Mask1<T> mask,const Vec1<T> no)244 HWY_INLINE Vec1<T> IfThenZeroElse(const Mask1<T> mask, const Vec1<T> no) {
245 return mask.bits ? Vec1<T>(0) : no;
246 }
247
248 template <typename T>
ZeroIfNegative(const Vec1<T> v)249 HWY_INLINE Vec1<T> ZeroIfNegative(const Vec1<T> v) {
250 return v.raw < 0 ? Vec1<T>(0) : v;
251 }
252
253 // ------------------------------ Mask logical
254
255 template <typename T>
Not(const Mask1<T> m)256 HWY_API Mask1<T> Not(const Mask1<T> m) {
257 const Sisd<T> d;
258 return MaskFromVec(Not(VecFromMask(d, m)));
259 }
260
261 template <typename T>
And(const Mask1<T> a,Mask1<T> b)262 HWY_API Mask1<T> And(const Mask1<T> a, Mask1<T> b) {
263 const Sisd<T> d;
264 return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
265 }
266
267 template <typename T>
AndNot(const Mask1<T> a,Mask1<T> b)268 HWY_API Mask1<T> AndNot(const Mask1<T> a, Mask1<T> b) {
269 const Sisd<T> d;
270 return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
271 }
272
273 template <typename T>
Or(const Mask1<T> a,Mask1<T> b)274 HWY_API Mask1<T> Or(const Mask1<T> a, Mask1<T> b) {
275 const Sisd<T> d;
276 return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
277 }
278
279 template <typename T>
Xor(const Mask1<T> a,Mask1<T> b)280 HWY_API Mask1<T> Xor(const Mask1<T> a, Mask1<T> b) {
281 const Sisd<T> d;
282 return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
283 }
284
285 // ================================================== SHIFTS
286
287 // ------------------------------ ShiftLeft (BroadcastSignBit)
288
289 template <int kBits, typename T>
ShiftLeft(const Vec1<T> v)290 HWY_INLINE Vec1<T> ShiftLeft(const Vec1<T> v) {
291 static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
292 return Vec1<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits);
293 }
294
// Shifts right by the compile-time constant kBits: arithmetic for signed T
// (sign bit replicated), logical for unsigned T.
template <int kBits, typename T>
HWY_INLINE Vec1<T> ShiftRight(const Vec1<T> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  return Vec1<T>(v.raw >> kBits);
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because
    // signed shifts are still implementation-defined.
    using TU = hwy::MakeUnsigned<T>;
    const Sisd<TU> du;
    const TU shifted = BitCast(du, v).raw >> kBits;
    // All-ones if negative, else zero; provides the bits to fill in at the top.
    const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
    const TU upper = sign << (sizeof(TU) * 8 - 1 - kBits);
    return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
  } else {
    return Vec1<T>(v.raw >> kBits);  // unsigned, logical shift
  }
#endif
}
317
318 // ------------------------------ ShiftLeftSame (BroadcastSignBit)
319
320 template <typename T>
ShiftLeftSame(const Vec1<T> v,int bits)321 HWY_INLINE Vec1<T> ShiftLeftSame(const Vec1<T> v, int bits) {
322 return Vec1<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits);
323 }
324
// Shifts right by a runtime count (in [0, bit width)): arithmetic for signed
// T, logical for unsigned T. Mirrors ShiftRight.
template <typename T>
HWY_INLINE Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  return Vec1<T>(v.raw >> bits);
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because
    // signed shifts are still implementation-defined.
    using TU = hwy::MakeUnsigned<T>;
    const Sisd<TU> du;
    const TU shifted = BitCast(du, v).raw >> bits;
    // All-ones if negative, else zero; provides the bits to fill in at the top.
    const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
    const TU upper = sign << (sizeof(TU) * 8 - 1 - bits);
    return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
  } else {
    return Vec1<T>(v.raw >> bits);  // unsigned, logical shift
  }
#endif
}
346
347 // ------------------------------ Shl
348
349 // Single-lane => same as ShiftLeftSame except for the argument type.
350 template <typename T>
351 HWY_INLINE Vec1<T> operator<<(const Vec1<T> v, const Vec1<T> bits) {
352 return ShiftLeftSame(v, static_cast<int>(bits.raw));
353 }
354
355 template <typename T>
356 HWY_INLINE Vec1<T> operator>>(const Vec1<T> v, const Vec1<T> bits) {
357 return ShiftRightSame(v, static_cast<int>(bits.raw));
358 }
359
// ================================================== ARITHMETIC

// Integer addition with wraparound: performed in uint64_t so that signed
// overflow (UB) cannot occur; the conversion back to T truncates modulo 2^N.
template <typename T>
HWY_INLINE Vec1<T> operator+(Vec1<T> a, Vec1<T> b) {
  const uint64_t a64 = static_cast<uint64_t>(a.raw);
  const uint64_t b64 = static_cast<uint64_t>(b.raw);
  return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))));
}
HWY_INLINE Vec1<float> operator+(const Vec1<float> a, const Vec1<float> b) {
  return Vec1<float>(a.raw + b.raw);
}
HWY_INLINE Vec1<double> operator+(const Vec1<double> a, const Vec1<double> b) {
  return Vec1<double>(a.raw + b.raw);
}

// Integer subtraction with wraparound, same approach as operator+.
template <typename T>
HWY_INLINE Vec1<T> operator-(Vec1<T> a, Vec1<T> b) {
  const uint64_t a64 = static_cast<uint64_t>(a.raw);
  const uint64_t b64 = static_cast<uint64_t>(b.raw);
  return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))));
}
HWY_INLINE Vec1<float> operator-(const Vec1<float> a, const Vec1<float> b) {
  return Vec1<float>(a.raw - b.raw);
}
HWY_INLINE Vec1<double> operator-(const Vec1<double> a, const Vec1<double> b) {
  return Vec1<double>(a.raw - b.raw);
}
387
// ------------------------------ Saturating addition

// Returns a + b clamped to the destination range.
// The sums below are computed in int (integer promotion), so they cannot
// overflow before being clamped.

// Unsigned
HWY_INLINE Vec1<uint8_t> SaturatedAdd(const Vec1<uint8_t> a,
                                      const Vec1<uint8_t> b) {
  return Vec1<uint8_t>(
      static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255)));
}
HWY_INLINE Vec1<uint16_t> SaturatedAdd(const Vec1<uint16_t> a,
                                       const Vec1<uint16_t> b) {
  return Vec1<uint16_t>(
      static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535)));
}

// Signed
HWY_INLINE Vec1<int8_t> SaturatedAdd(const Vec1<int8_t> a,
                                     const Vec1<int8_t> b) {
  return Vec1<int8_t>(
      static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127)));
}
HWY_INLINE Vec1<int16_t> SaturatedAdd(const Vec1<int16_t> a,
                                      const Vec1<int16_t> b) {
  return Vec1<int16_t>(
      static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767)));
}
415
// ------------------------------ Saturating subtraction

// Returns a - b clamped to the destination range.
// The differences below are computed in int (integer promotion), so they
// cannot overflow before being clamped.

// Unsigned
HWY_INLINE Vec1<uint8_t> SaturatedSub(const Vec1<uint8_t> a,
                                      const Vec1<uint8_t> b) {
  return Vec1<uint8_t>(
      static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255)));
}
HWY_INLINE Vec1<uint16_t> SaturatedSub(const Vec1<uint16_t> a,
                                       const Vec1<uint16_t> b) {
  return Vec1<uint16_t>(
      static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535)));
}

// Signed
HWY_INLINE Vec1<int8_t> SaturatedSub(const Vec1<int8_t> a,
                                     const Vec1<int8_t> b) {
  return Vec1<int8_t>(
      static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127)));
}
HWY_INLINE Vec1<int16_t> SaturatedSub(const Vec1<int16_t> a,
                                      const Vec1<int16_t> b) {
  return Vec1<int16_t>(
      static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767)));
}
443
444 // ------------------------------ Average
445
446 // Returns (a + b + 1) / 2
447
AverageRound(const Vec1<uint8_t> a,const Vec1<uint8_t> b)448 HWY_INLINE Vec1<uint8_t> AverageRound(const Vec1<uint8_t> a,
449 const Vec1<uint8_t> b) {
450 return Vec1<uint8_t>(static_cast<uint8_t>((a.raw + b.raw + 1) / 2));
451 }
AverageRound(const Vec1<uint16_t> a,const Vec1<uint16_t> b)452 HWY_INLINE Vec1<uint16_t> AverageRound(const Vec1<uint16_t> a,
453 const Vec1<uint16_t> b) {
454 return Vec1<uint16_t>(static_cast<uint16_t>((a.raw + b.raw + 1) / 2));
455 }
456
// ------------------------------ Absolute value

// Integer absolute value. LimitsMin (e.g. INT_MIN) is returned unchanged:
// negating it would be signed-overflow UB, and this matches SIMD abs
// instructions, which also wrap in that case.
template <typename T>
HWY_INLINE Vec1<T> Abs(const Vec1<T> a) {
  const T i = a.raw;
  return (i >= 0 || i == hwy::LimitsMin<T>()) ? a : Vec1<T>(-i);
}
HWY_INLINE Vec1<float> Abs(const Vec1<float> a) {
  return Vec1<float>(std::abs(a.raw));
}
HWY_INLINE Vec1<double> Abs(const Vec1<double> a) {
  return Vec1<double>(std::abs(a.raw));
}
470
// ------------------------------ min/max

template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_INLINE Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(HWY_MIN(a.raw, b.raw));
}

// Floating-point Min: if exactly one operand is NaN, returns the other
// (matching IEEE-754 minNum semantics).
template <typename T, HWY_IF_FLOAT(T)>
HWY_INLINE Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
  if (std::isnan(a.raw)) return b;
  if (std::isnan(b.raw)) return a;
  return Vec1<T>(HWY_MIN(a.raw, b.raw));
}

template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_INLINE Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(HWY_MAX(a.raw, b.raw));
}

// Floating-point Max: if exactly one operand is NaN, returns the other.
template <typename T, HWY_IF_FLOAT(T)>
HWY_INLINE Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
  if (std::isnan(a.raw)) return b;
  if (std::isnan(b.raw)) return a;
  return Vec1<T>(HWY_MAX(a.raw, b.raw));
}
496
// ------------------------------ Floating-point negate

// Float negation: flip the sign bit (also works for NaN/inf/-0).
template <typename T, HWY_IF_FLOAT(T)>
HWY_INLINE Vec1<T> Neg(const Vec1<T> v) {
  return Xor(v, SignBit(Sisd<T>()));
}

// Integer negation via 0 - v; operator- wraps, avoiding UB for LimitsMin.
template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_INLINE Vec1<T> Neg(const Vec1<T> v) {
  return Zero(Sisd<T>()) - v;
}
508
// ------------------------------ mul/div

// Multiplication. The branches are on compile-time type traits, so the
// compiler keeps only the applicable one. Integer products are computed in a
// 64-bit type to avoid signed-overflow UB, then truncated; float products use
// a double intermediate (exact for the product of two floats).
template <typename T>
HWY_INLINE Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
  if (hwy::IsFloat<T>()) {
    return Vec1<T>(static_cast<T>(double(a.raw) * b.raw));
  } else if (hwy::IsSigned<T>()) {
    return Vec1<T>(static_cast<T>(int64_t(a.raw) * b.raw));
  } else {
    return Vec1<T>(static_cast<T>(uint64_t(a.raw) * b.raw));
  }
}

// Division; callers must avoid b == 0 (UB for integers, inf/NaN for floats).
template <typename T>
HWY_INLINE Vec1<T> operator/(const Vec1<T> a, const Vec1<T> b) {
  return Vec1<T>(a.raw / b.raw);
}
526
// Returns the upper 16 bits of a * b in each lane.
// NOTE(review): the product is computed in int (promotion), and >> 16 on a
// negative int is implementation-defined before C++20 (arithmetic shift on
// common compilers) — confirm this matches the intended target semantics.
HWY_INLINE Vec1<int16_t> MulHigh(const Vec1<int16_t> a, const Vec1<int16_t> b) {
  return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw) >> 16));
}
HWY_INLINE Vec1<uint16_t> MulHigh(const Vec1<uint16_t> a,
                                  const Vec1<uint16_t> b) {
  // Cast to uint32_t first to prevent overflow. Otherwise the result of
  // uint16_t * uint16_t is in "int" which may overflow. In practice the result
  // is the same but this way it is also defined.
  return Vec1<uint16_t>(static_cast<uint16_t>(
      (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
}

// Multiplies even lanes (0, 2 ..) and returns the double-wide result.
// With a single lane, this is simply the widening product.
HWY_INLINE Vec1<int64_t> MulEven(const Vec1<int32_t> a, const Vec1<int32_t> b) {
  const int64_t a64 = a.raw;  // widen first so the product is 64-bit
  return Vec1<int64_t>(a64 * b.raw);
}
HWY_INLINE Vec1<uint64_t> MulEven(const Vec1<uint32_t> a,
                                  const Vec1<uint32_t> b) {
  const uint64_t a64 = a.raw;  // widen first so the product is 64-bit
  return Vec1<uint64_t>(a64 * b.raw);
}
550
// Approximate reciprocal (here: exact 1/x, since scalar has no fast-rcp op).
HWY_INLINE Vec1<float> ApproximateReciprocal(const Vec1<float> v) {
  // Zero inputs are allowed, but callers are responsible for replacing the
  // return value with something else (typically using IfThenElse). This check
  // avoids a ubsan error. The return value is arbitrary.
  if (v.raw == 0.0f) return Vec1<float>(0.0f);
  return Vec1<float>(1.0f / v.raw);
}

// Absolute value of difference: |a - b|.
HWY_INLINE Vec1<float> AbsDiff(const Vec1<float> a, const Vec1<float> b) {
  return Abs(a - b);
}
564
565 // ------------------------------ Floating-point multiply-add variants
566
567 template <typename T>
MulAdd(const Vec1<T> mul,const Vec1<T> x,const Vec1<T> add)568 HWY_INLINE Vec1<T> MulAdd(const Vec1<T> mul, const Vec1<T> x,
569 const Vec1<T> add) {
570 return mul * x + add;
571 }
572
573 template <typename T>
NegMulAdd(const Vec1<T> mul,const Vec1<T> x,const Vec1<T> add)574 HWY_INLINE Vec1<T> NegMulAdd(const Vec1<T> mul, const Vec1<T> x,
575 const Vec1<T> add) {
576 return add - mul * x;
577 }
578
579 template <typename T>
MulSub(const Vec1<T> mul,const Vec1<T> x,const Vec1<T> sub)580 HWY_INLINE Vec1<T> MulSub(const Vec1<T> mul, const Vec1<T> x,
581 const Vec1<T> sub) {
582 return mul * x - sub;
583 }
584
585 template <typename T>
NegMulSub(const Vec1<T> mul,const Vec1<T> x,const Vec1<T> sub)586 HWY_INLINE Vec1<T> NegMulSub(const Vec1<T> mul, const Vec1<T> x,
587 const Vec1<T> sub) {
588 return Neg(mul) * x - sub;
589 }
590
// ------------------------------ Floating-point square root

// Approximate reciprocal square root, using the classic "fast inverse square
// root" bit trick plus one Newton-Raphson refinement step.
HWY_INLINE Vec1<float> ApproximateReciprocalSqrt(const Vec1<float> v) {
  float f = v.raw;
  const float half = f * 0.5f;
  uint32_t bits;
  CopyBytes<4>(&f, &bits);  // bit-level view of f (safe type punning)
  // Initial guess based on log2(f)
  bits = 0x5F3759DF - (bits >> 1);
  CopyBytes<4>(&bits, &f);
  // One Newton-Raphson iteration
  return Vec1<float>(f * (1.5f - (half * f * f)));
}

// Square root
HWY_INLINE Vec1<float> Sqrt(const Vec1<float> v) {
  return Vec1<float>(std::sqrt(v.raw));
}
HWY_INLINE Vec1<double> Sqrt(const Vec1<double> v) {
  return Vec1<double>(std::sqrt(v.raw));
}
613
// ------------------------------ Floating-point rounding

// Rounds to nearest integer, ties to even (matching SIMD round instructions).
template <typename T>
HWY_INLINE Vec1<T> Round(const Vec1<T> v) {
  using TI = MakeSigned<T>;
  // Negated comparison so NaN also takes this branch.
  if (!(Abs(v).raw < MantissaEnd<T>())) {  // Huge or NaN
    return v;
  }
  const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
  const TI rounded = static_cast<TI>(v.raw + bias);
  // Preserve the sign of zero (e.g. Round(-0.2) == -0.0).
  if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
  // Round to even
  if ((rounded & 1) && std::abs(rounded - v.raw) == T(0.5)) {
    return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
  }
  return Vec1<T>(static_cast<T>(rounded));
}
631
// Round-to-nearest even.
// Converts to int32_t; out-of-range/NaN inputs saturate to LimitsMin/Max
// instead of invoking UB via static_cast.
HWY_INLINE Vec1<int32_t> NearestInt(const Vec1<float> v) {
  using T = float;
  using TI = int32_t;

  const T abs = Abs(v).raw;
  const bool signbit = std::signbit(v.raw);

  // Negated comparison so NaN also takes this branch.
  if (!(abs < MantissaEnd<T>())) {  // Huge or NaN
    // Check if too large to cast or NaN
    if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
      return Vec1<TI>(signbit ? LimitsMin<TI>() : LimitsMax<TI>());
    }
    // >= MantissaEnd means already an integer; cast is exact.
    return Vec1<int32_t>(static_cast<TI>(v.raw));
  }
  const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
  const TI rounded = static_cast<TI>(v.raw + bias);
  if (rounded == 0) return Vec1<int32_t>(0);
  // Round to even
  if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
    return Vec1<TI>(rounded - (signbit ? -1 : 1));
  }
  return Vec1<TI>(rounded);
}
656
// Rounds toward zero (truncation); huge/NaN inputs are returned unchanged.
template <typename T>
HWY_INLINE Vec1<T> Trunc(const Vec1<T> v) {
  using TI = MakeSigned<T>;
  // Negated comparison so NaN also takes this branch.
  if (!(Abs(v).raw <= MantissaEnd<T>())) {  // Huge or NaN
    return v;
  }
  const TI truncated = static_cast<TI>(v.raw);
  // Preserve the sign of zero (e.g. Trunc(-0.2) == -0.0).
  if (truncated == 0) return CopySignToAbs(Vec1<T>(0), v);
  return Vec1<T>(static_cast<T>(truncated));
}
667
// Rounds toward +infinity by manipulating the IEEE-754 representation
// directly. Float/Bits/kMantissaBits/kExponentBits describe the format
// (e.g. float/uint32_t/23/8); V is the vector wrapper.
template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
          class V>
V Ceiling(const V v) {
  const Bits kExponentMask = (1ull << kExponentBits) - 1;
  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
  const Bits kBias = kExponentMask / 2;

  Float f = v.raw;
  const bool positive = f > Float(0.0);

  Bits bits;
  CopyBytes<sizeof(Bits)>(&v, &bits);

  // Unbiased exponent = floor(log2(|f|)).
  const int exponent =
      static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
  // Already an integer.
  if (exponent >= kMantissaBits) return v;
  // |v| <= 1 => 0 or 1.
  if (exponent < 0) return positive ? V(1) : V(-0.0);

  // Mask covering the fractional mantissa bits for this exponent.
  const Bits mantissa_mask = kMantissaMask >> exponent;
  // Already an integer
  if ((bits & mantissa_mask) == 0) return v;

  // Clear fractional bits and round up
  if (positive) bits += (kMantissaMask + 1) >> exponent;
  bits &= ~mantissa_mask;

  CopyBytes<sizeof(Bits)>(&bits, &f);
  return V(f);
}
699
// Rounds toward -infinity by manipulating the IEEE-754 representation
// directly; mirror image of Ceiling above.
template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
          class V>
V Floor(const V v) {
  const Bits kExponentMask = (1ull << kExponentBits) - 1;
  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
  const Bits kBias = kExponentMask / 2;

  Float f = v.raw;
  const bool negative = f < Float(0.0);

  Bits bits;
  CopyBytes<sizeof(Bits)>(&v, &bits);

  // Unbiased exponent = floor(log2(|f|)).
  const int exponent =
      static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
  // Already an integer.
  if (exponent >= kMantissaBits) return v;
  // |v| <= 1 => -1 or 0.
  if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0));

  // Mask covering the fractional mantissa bits for this exponent.
  const Bits mantissa_mask = kMantissaMask >> exponent;
  // Already an integer
  if ((bits & mantissa_mask) == 0) return v;

  // Clear fractional bits and round down
  if (negative) bits += (kMantissaMask + 1) >> exponent;
  bits &= ~mantissa_mask;

  CopyBytes<sizeof(Bits)>(&bits, &f);
  return V(f);
}
731
// Toward +infinity, aka ceiling
// Template args: Float type, bit-equivalent unsigned type, mantissa bits,
// exponent bits of the IEEE-754 format.
HWY_INLINE Vec1<float> Ceil(const Vec1<float> v) {
  return Ceiling<float, uint32_t, 23, 8>(v);
}
HWY_INLINE Vec1<double> Ceil(const Vec1<double> v) {
  return Ceiling<double, uint64_t, 52, 11>(v);
}

// Toward -infinity, aka floor
HWY_INLINE Vec1<float> Floor(const Vec1<float> v) {
  return Floor<float, uint32_t, 23, 8>(v);
}
HWY_INLINE Vec1<double> Floor(const Vec1<double> v) {
  return Floor<double, uint64_t, 52, 11>(v);
}
747
748 // ================================================== COMPARE
749
750 template <typename T>
751 HWY_INLINE Mask1<T> operator==(const Vec1<T> a, const Vec1<T> b) {
752 return Mask1<T>::FromBool(a.raw == b.raw);
753 }
754
755 template <typename T>
TestBit(const Vec1<T> v,const Vec1<T> bit)756 HWY_INLINE Mask1<T> TestBit(const Vec1<T> v, const Vec1<T> bit) {
757 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
758 return (v & bit) == bit;
759 }
760
761 template <typename T>
762 HWY_INLINE Mask1<T> operator<(const Vec1<T> a, const Vec1<T> b) {
763 return Mask1<T>::FromBool(a.raw < b.raw);
764 }
765 template <typename T>
766 HWY_INLINE Mask1<T> operator>(const Vec1<T> a, const Vec1<T> b) {
767 return Mask1<T>::FromBool(a.raw > b.raw);
768 }
769
770 template <typename T>
771 HWY_INLINE Mask1<T> operator<=(const Vec1<T> a, const Vec1<T> b) {
772 return Mask1<T>::FromBool(a.raw <= b.raw);
773 }
774 template <typename T>
775 HWY_INLINE Mask1<T> operator>=(const Vec1<T> a, const Vec1<T> b) {
776 return Mask1<T>::FromBool(a.raw >= b.raw);
777 }
778
// ================================================== MEMORY

// ------------------------------ Load

// Loads one lane via byte copy (no alignment requirement in practice, but the
// parameter name documents the contract shared with wider targets).
template <typename T>
HWY_INLINE Vec1<T> Load(Sisd<T> /* tag */, const T* HWY_RESTRICT aligned) {
  T t;
  CopyBytes<sizeof(T)>(aligned, &t);
  return Vec1<T>(t);
}

// Unaligned load: identical to Load for a single lane.
template <typename T>
HWY_INLINE Vec1<T> LoadU(Sisd<T> d, const T* HWY_RESTRICT p) {
  return Load(d, p);
}

// In some use cases, "load single lane" is sufficient; otherwise avoid this.
template <typename T>
HWY_INLINE Vec1<T> LoadDup128(Sisd<T> d, const T* HWY_RESTRICT aligned) {
  return Load(d, aligned);
}

// ------------------------------ Store

// Stores the lane via byte copy.
template <typename T>
HWY_INLINE void Store(const Vec1<T> v, Sisd<T> /* tag */,
                      T* HWY_RESTRICT aligned) {
  CopyBytes<sizeof(T)>(&v.raw, aligned);
}

// Unaligned store: identical to Store for a single lane.
template <typename T>
HWY_INLINE void StoreU(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT p) {
  return Store(v, d, p);
}
813
814 // ------------------------------ StoreInterleaved3
815
StoreInterleaved3(const Vec1<uint8_t> v0,const Vec1<uint8_t> v1,const Vec1<uint8_t> v2,Sisd<uint8_t> d,uint8_t * HWY_RESTRICT unaligned)816 HWY_API void StoreInterleaved3(const Vec1<uint8_t> v0, const Vec1<uint8_t> v1,
817 const Vec1<uint8_t> v2, Sisd<uint8_t> d,
818 uint8_t* HWY_RESTRICT unaligned) {
819 StoreU(v0, d, unaligned + 0);
820 StoreU(v1, d, unaligned + 1);
821 StoreU(v2, d, unaligned + 2);
822 }
823
StoreInterleaved4(const Vec1<uint8_t> v0,const Vec1<uint8_t> v1,const Vec1<uint8_t> v2,const Vec1<uint8_t> v3,Sisd<uint8_t> d,uint8_t * HWY_RESTRICT unaligned)824 HWY_API void StoreInterleaved4(const Vec1<uint8_t> v0, const Vec1<uint8_t> v1,
825 const Vec1<uint8_t> v2, const Vec1<uint8_t> v3,
826 Sisd<uint8_t> d,
827 uint8_t* HWY_RESTRICT unaligned) {
828 StoreU(v0, d, unaligned + 0);
829 StoreU(v1, d, unaligned + 1);
830 StoreU(v2, d, unaligned + 2);
831 StoreU(v3, d, unaligned + 3);
832 }
833
// ------------------------------ Stream

// Non-temporal store: plain Store for the scalar target (no cache hints).
template <typename T>
HWY_INLINE void Stream(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT aligned) {
  return Store(v, d, aligned);
}

// ------------------------------ Scatter

// Stores v at base + offset.raw BYTES (not elements).
template <typename T, typename Offset>
HWY_INLINE void ScatterOffset(Vec1<T> v, Sisd<T> d, T* base,
                              const Vec1<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw;
  return Store(v, d, reinterpret_cast<T*>(base8));
}

// Stores v at base + index.raw ELEMENTS.
template <typename T, typename Index>
HWY_INLINE void ScatterIndex(Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT base,
                             const Vec1<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  return Store(v, d, base + index.raw);
}

// ------------------------------ Gather

// Loads from base + offset.raw BYTES (not elements).
template <typename T, typename Offset>
HWY_INLINE Vec1<T> GatherOffset(Sisd<T> d, const T* base,
                                const Vec1<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  const uintptr_t addr = reinterpret_cast<uintptr_t>(base) + offset.raw;
  return Load(d, reinterpret_cast<const T*>(addr));
}

// Loads from base + index.raw ELEMENTS.
template <typename T, typename Index>
HWY_INLINE Vec1<T> GatherIndex(Sisd<T> d, const T* HWY_RESTRICT base,
                               const Vec1<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  return Load(d, base + index.raw);
}
874
875 // ================================================== CONVERT
876
877 // ConvertTo and DemoteTo with floating-point input and integer output truncate
878 // (rounding toward zero).
879
880 template <typename FromT, typename ToT>
PromoteTo(Sisd<ToT>,Vec1<FromT> from)881 HWY_INLINE Vec1<ToT> PromoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
882 static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting");
883 // For bits Y > X, floatX->floatY and intX->intY are always representable.
884 return Vec1<ToT>(static_cast<ToT>(from.raw));
885 }
886
887 template <typename FromT, typename ToT, HWY_IF_FLOAT(FromT)>
DemoteTo(Sisd<ToT>,Vec1<FromT> from)888 HWY_INLINE Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
889 static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");
890
891 // Prevent ubsan errors when converting float to narrower integer/float
892 if (std::isinf(from.raw) ||
893 std::fabs(from.raw) > static_cast<FromT>(HighestValue<ToT>())) {
894 return Vec1<ToT>(std::signbit(from.raw) ? LowestValue<ToT>()
895 : HighestValue<ToT>());
896 }
897 return Vec1<ToT>(static_cast<ToT>(from.raw));
898 }
899
900 template <typename FromT, typename ToT, HWY_IF_NOT_FLOAT(FromT)>
DemoteTo(Sisd<ToT>,Vec1<FromT> from)901 HWY_INLINE Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
902 static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");
903
904 // Int to int: choose closest value in ToT to `from` (avoids UB)
905 from.raw = std::min<FromT>(std::max<FromT>(LimitsMin<ToT>(), from.raw),
906 LimitsMax<ToT>());
907 return Vec1<ToT>(static_cast<ToT>(from.raw));
908 }
909
PromoteTo(Sisd<float>,const Vec1<float16_t> v)910 static HWY_INLINE Vec1<float> PromoteTo(Sisd<float> /* tag */,
911 const Vec1<float16_t> v) {
912 #if HWY_NATIVE_FLOAT16
913 uint16_t bits16;
914 CopyBytes<2>(&v.raw, &bits16);
915 #else
916 const uint16_t bits16 = v.raw.bits;
917 #endif
918 const uint32_t sign = bits16 >> 15;
919 const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
920 const uint32_t mantissa = bits16 & 0x3FF;
921
922 // Subnormal or zero
923 if (biased_exp == 0) {
924 const float subnormal =
925 (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
926 return Vec1<float>(sign ? -subnormal : subnormal);
927 }
928
929 // Normalized: convert the representation directly (faster than ldexp/tables).
930 const uint32_t biased_exp32 = biased_exp + (127 - 15);
931 const uint32_t mantissa32 = mantissa << (23 - 10);
932 const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
933 float out;
934 CopyBytes<4>(&bits32, &out);
935 return Vec1<float>(out);
936 }
937
DemoteTo(Sisd<float16_t>,const Vec1<float> v)938 static HWY_INLINE Vec1<float16_t> DemoteTo(Sisd<float16_t> /* tag */,
939 const Vec1<float> v) {
940 uint32_t bits32;
941 CopyBytes<4>(&v.raw, &bits32);
942 const uint32_t sign = bits32 >> 31;
943 const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
944 const uint32_t mantissa32 = bits32 & 0x7FFFFF;
945
946 const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);
947
948 // Tiny or zero => zero.
949 Vec1<float16_t> out;
950 if (exp < -24) {
951 #if HWY_NATIVE_FLOAT16
952 const uint16_t zero = 0;
953 CopyBytes<2>(&zero, &out.raw);
954 #else
955 out.raw.bits = 0;
956 #endif
957 return out;
958 }
959
960 uint32_t biased_exp16, mantissa16;
961
962 // exp = [-24, -15] => subnormal
963 if (exp < -14) {
964 biased_exp16 = 0;
965 const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
966 HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
967 mantissa16 = (1 << (10 - sub_exp)) + (mantissa32 >> (13 + sub_exp));
968 } else {
969 // exp = [-14, 15]
970 biased_exp16 = static_cast<uint32_t>(exp + 15);
971 HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
972 mantissa16 = mantissa32 >> 13;
973 }
974
975 HWY_DASSERT(mantissa16 < 1024);
976 const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
977 HWY_DASSERT(bits16 < 0x10000);
978 #if HWY_NATIVE_FLOAT16
979 const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
980 CopyBytes<2>(&narrowed, &out.raw);
981 #else
982 out.raw.bits = static_cast<uint16_t>(bits16);
983 #endif
984 return out;
985 }
986
987 template <typename FromT, typename ToT, HWY_IF_FLOAT(FromT)>
ConvertTo(Sisd<ToT>,Vec1<FromT> from)988 HWY_INLINE Vec1<ToT> ConvertTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
989 static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
990 // float## -> int##: return closest representable value. We cannot exactly
991 // represent LimitsMax<ToT> in FromT, so use double.
992 const double f = static_cast<double>(from.raw);
993 if (std::isinf(from.raw) ||
994 std::fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
995 return Vec1<ToT>(std::signbit(from.raw) ? LimitsMin<ToT>()
996 : LimitsMax<ToT>());
997 }
998 return Vec1<ToT>(static_cast<ToT>(from.raw));
999 }
1000
1001 template <typename FromT, typename ToT, HWY_IF_NOT_FLOAT(FromT)>
ConvertTo(Sisd<ToT>,Vec1<FromT> from)1002 HWY_INLINE Vec1<ToT> ConvertTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
1003 static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
1004 // int## -> float##: no check needed
1005 return Vec1<ToT>(static_cast<ToT>(from.raw));
1006 }
1007
U8FromU32(const Vec1<uint32_t> v)1008 HWY_INLINE Vec1<uint8_t> U8FromU32(const Vec1<uint32_t> v) {
1009 return DemoteTo(Sisd<uint8_t>(), v);
1010 }
1011
1012 // ================================================== SWIZZLE
1013
1014 // Unsupported: Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle*,
1015 // UpperHalf - these require more than one lane and/or actual 128-bit vectors.
1016
1017 template <typename T>
GetLane(const Vec1<T> v)1018 HWY_INLINE T GetLane(const Vec1<T> v) {
1019 return v.raw;
1020 }
1021
1022 template <typename T>
LowerHalf(Vec1<T> v)1023 HWY_INLINE Vec1<T> LowerHalf(Vec1<T> v) {
1024 return v;
1025 }
1026
1027 // ------------------------------ Broadcast/splat any lane
1028
1029 template <int kLane, typename T>
Broadcast(const Vec1<T> v)1030 HWY_INLINE Vec1<T> Broadcast(const Vec1<T> v) {
1031 static_assert(kLane == 0, "Scalar only has one lane");
1032 return v;
1033 }
1034
1035 // ------------------------------ Shuffle bytes with variable indices
1036
1037 // Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e.
1038 // indices in [0, sizeof(T)).
1039 template <typename T>
TableLookupBytes(const Vec1<T> in,const Vec1<T> from)1040 HWY_API Vec1<T> TableLookupBytes(const Vec1<T> in, const Vec1<T> from) {
1041 uint8_t in_bytes[sizeof(T)];
1042 uint8_t from_bytes[sizeof(T)];
1043 uint8_t out_bytes[sizeof(T)];
1044 CopyBytes<sizeof(T)>(&in, &in_bytes);
1045 CopyBytes<sizeof(T)>(&from, &from_bytes);
1046 for (size_t i = 0; i < sizeof(T); ++i) {
1047 out_bytes[i] = in_bytes[from_bytes[i]];
1048 }
1049 T out;
1050 CopyBytes<sizeof(T)>(&out_bytes, &out);
1051 return Vec1<T>{out};
1052 }
1053
1054 // ------------------------------ TableLookupLanes
1055
1056 // Returned by SetTableIndices for use by TableLookupLanes.
template <typename T>
struct Indices1 {
  int raw;  // lane index; must be 0 because scalar vectors have one lane
};
1061
1062 template <typename T>
SetTableIndices(Sisd<T>,const int32_t * idx)1063 HWY_API Indices1<T> SetTableIndices(Sisd<T>, const int32_t* idx) {
1064 #if !defined(NDEBUG) || defined(ADDRESS_SANITIZER)
1065 HWY_DASSERT(idx[0] == 0);
1066 #endif
1067 return Indices1<T>{idx[0]};
1068 }
1069
1070 template <typename T>
TableLookupLanes(const Vec1<T> v,const Indices1<T>)1071 HWY_API Vec1<T> TableLookupLanes(const Vec1<T> v, const Indices1<T> /* idx */) {
1072 return v;
1073 }
1074
1075 // ------------------------------ Zip/unpack
1076
ZipLower(const Vec1<uint8_t> a,const Vec1<uint8_t> b)1077 HWY_INLINE Vec1<uint16_t> ZipLower(const Vec1<uint8_t> a,
1078 const Vec1<uint8_t> b) {
1079 return Vec1<uint16_t>(static_cast<uint16_t>((uint32_t(b.raw) << 8) + a.raw));
1080 }
ZipLower(const Vec1<uint16_t> a,const Vec1<uint16_t> b)1081 HWY_INLINE Vec1<uint32_t> ZipLower(const Vec1<uint16_t> a,
1082 const Vec1<uint16_t> b) {
1083 return Vec1<uint32_t>((uint32_t(b.raw) << 16) + a.raw);
1084 }
ZipLower(const Vec1<uint32_t> a,const Vec1<uint32_t> b)1085 HWY_INLINE Vec1<uint64_t> ZipLower(const Vec1<uint32_t> a,
1086 const Vec1<uint32_t> b) {
1087 return Vec1<uint64_t>((uint64_t(b.raw) << 32) + a.raw);
1088 }
ZipLower(const Vec1<int8_t> a,const Vec1<int8_t> b)1089 HWY_INLINE Vec1<int16_t> ZipLower(const Vec1<int8_t> a, const Vec1<int8_t> b) {
1090 return Vec1<int16_t>(static_cast<int16_t>((int32_t(b.raw) << 8) + a.raw));
1091 }
ZipLower(const Vec1<int16_t> a,const Vec1<int16_t> b)1092 HWY_INLINE Vec1<int32_t> ZipLower(const Vec1<int16_t> a,
1093 const Vec1<int16_t> b) {
1094 return Vec1<int32_t>((int32_t(b.raw) << 16) + a.raw);
1095 }
ZipLower(const Vec1<int32_t> a,const Vec1<int32_t> b)1096 HWY_INLINE Vec1<int64_t> ZipLower(const Vec1<int32_t> a,
1097 const Vec1<int32_t> b) {
1098 return Vec1<int64_t>((int64_t(b.raw) << 32) + a.raw);
1099 }
1100
1101 // ------------------------------ Mask
1102
1103 template <typename T>
AllFalse(const Mask1<T> mask)1104 HWY_INLINE bool AllFalse(const Mask1<T> mask) {
1105 return mask.bits == 0;
1106 }
1107
1108 template <typename T>
AllTrue(const Mask1<T> mask)1109 HWY_INLINE bool AllTrue(const Mask1<T> mask) {
1110 return mask.bits != 0;
1111 }
1112
1113 template <typename T>
StoreMaskBits(const Mask1<T> mask,uint8_t * p)1114 HWY_INLINE size_t StoreMaskBits(const Mask1<T> mask, uint8_t* p) {
1115 *p = AllTrue(mask);
1116 return 1;
1117 }
1118 template <typename T>
CountTrue(const Mask1<T> mask)1119 HWY_INLINE size_t CountTrue(const Mask1<T> mask) {
1120 return mask.bits == 0 ? 0 : 1;
1121 }
1122
1123 template <typename T>
Compress(Vec1<T> v,const Mask1<T>)1124 HWY_API Vec1<T> Compress(Vec1<T> v, const Mask1<T> /* mask */) {
1125 // Upper lanes are undefined, so result is the same independent of mask.
1126 return v;
1127 }
1128
1129 // ------------------------------ CompressStore
1130
1131 template <typename T>
CompressStore(Vec1<T> v,const Mask1<T> mask,Sisd<T> d,T * HWY_RESTRICT aligned)1132 HWY_API size_t CompressStore(Vec1<T> v, const Mask1<T> mask, Sisd<T> d,
1133 T* HWY_RESTRICT aligned) {
1134 Store(Compress(v, mask), d, aligned);
1135 return CountTrue(mask);
1136 }
1137
1138 // ------------------------------ Reductions
1139
1140 // Sum of all lanes, i.e. the only one.
1141 template <typename T>
SumOfLanes(const Vec1<T> v0)1142 HWY_INLINE Vec1<T> SumOfLanes(const Vec1<T> v0) {
1143 return v0;
1144 }
1145 template <typename T>
MinOfLanes(const Vec1<T> v)1146 HWY_INLINE Vec1<T> MinOfLanes(const Vec1<T> v) {
1147 return v;
1148 }
1149 template <typename T>
MaxOfLanes(const Vec1<T> v)1150 HWY_INLINE Vec1<T> MaxOfLanes(const Vec1<T> v) {
1151 return v;
1152 }
1153
1154 // ================================================== Operator wrapper
1155
1156 template <class V>
Add(V a,V b)1157 HWY_API V Add(V a, V b) {
1158 return a + b;
1159 }
1160 template <class V>
Sub(V a,V b)1161 HWY_API V Sub(V a, V b) {
1162 return a - b;
1163 }
1164
1165 template <class V>
Mul(V a,V b)1166 HWY_API V Mul(V a, V b) {
1167 return a * b;
1168 }
1169 template <class V>
Div(V a,V b)1170 HWY_API V Div(V a, V b) {
1171 return a / b;
1172 }
1173
// Named alias for operator<<.
// NOTE(review): unlike the other wrappers in this section, this lacks
// HWY_API — consider adding it for consistency.
template <class V>
V Shl(V a, V b) {
  return a << b;
}
// Named alias for operator>>.
// NOTE(review): unlike the other wrappers in this section, this lacks
// HWY_API — consider adding it for consistency.
template <class V>
V Shr(V a, V b) {
  return a >> b;
}
1182
1183 template <class V>
1184 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
1185 return a == b;
1186 }
1187 template <class V>
1188 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
1189 return a < b;
1190 }
1191
1192 template <class V>
1193 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
1194 return a > b;
1195 }
1196 template <class V>
1197 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
1198 return a >= b;
1199 }
1200
1201 template <class V>
1202 HWY_API auto Le(V a, V b) -> decltype(a == b) {
1203 return a <= b;
1204 }
1205
1206 // NOLINTNEXTLINE(google-readability-namespace-comments)
1207 } // namespace HWY_NAMESPACE
1208 } // namespace hwy
1209 HWY_AFTER_NAMESPACE();
1210