// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/blockwise_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
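// foreach_target.h re-includes this file (named by HWY_TARGET_INCLUDE) once
// per enabled target, redefining HWY_TARGET and HWY_NAMESPACE each time, so
// the tests below are compiled and registered for every target.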

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

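// ShiftLeftBytes<kBytes>(v) moves each byte of every 128-bit block to the
// next-higher byte index, filling the vacated low bytes with zero;
// ShiftRightBytes mirrors this toward lower indices. For example, per block
// of u8 lanes, ShiftLeftBytes<1>([1, 2, .., 16]) == [0, 1, .., 15].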
struct TestShiftBytes {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    // Scalar does not define Shift*Bytes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
    const Repartition<uint8_t, D> du8;
    const size_t N8 = Lanes(du8);

    // Zero remains zero
    const auto v0 = Zero(d);
    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(v0));
    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(d, v0));
    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(d, v0));

    // Zero after shifting out the high/low byte
    auto bytes = AllocateAligned<uint8_t>(N8);
    std::fill(bytes.get(), bytes.get() + N8, 0);
    bytes[N8 - 1] = 0x7F;
    const auto vhi = BitCast(d, Load(du8, bytes.get()));
    bytes[N8 - 1] = 0;
    bytes[0] = 0x7F;
    const auto vlo = BitCast(d, Load(du8, bytes.get()));
    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(vhi));
    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(d, vhi));
    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(d, vlo));

    // Check expected result with Iota
    const size_t N = Lanes(d);
    auto in = AllocateAligned<T>(N);
    const uint8_t* in_bytes = reinterpret_cast<const uint8_t*>(in.get());
    const auto v = BitCast(d, Iota(du8, 1));
    Store(v, d, in.get());

    auto expected = AllocateAligned<T>(N);
    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());

    const size_t kBlockSize = HWY_MIN(N8, 16);
    for (size_t block = 0; block < N8; block += kBlockSize) {
      expected_bytes[block] = 0;
      memcpy(expected_bytes + block + 1, in_bytes + block, kBlockSize - 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(v));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(d, v));

    for (size_t block = 0; block < N8; block += kBlockSize) {
      memcpy(expected_bytes + block, in_bytes + block + 1, kBlockSize - 1);
      expected_bytes[block + kBlockSize - 1] = 0;
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightBytes<1>(d, v));
#else
    (void)d;
#endif  // #if HWY_TARGET != HWY_SCALAR
  }
};

HWY_NOINLINE void TestAllShiftBytes() {
  ForIntegerTypes(ForPartialVectors<TestShiftBytes>());
}

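// Shift*Lanes shift by whole lanes of width sizeof(T) within each 128-bit
// block, again inserting zeros; e.g. for i32, ShiftLeftLanes<1>([1, 2, 3, 4])
// == [0, 1, 2, 3] per block.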
struct TestShiftLanes {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    // Scalar does not define Shift*Lanes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
    const auto v = Iota(d, T(1));
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(v));
    HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(d, v));
    HWY_ASSERT_VEC_EQ(d, v, ShiftRightLanes<0>(d, v));

    constexpr size_t kLanesPerBlock = 16 / sizeof(T);

    for (size_t i = 0; i < N; ++i) {
      expected[i] = (i % kLanesPerBlock) == 0 ? T(0) : T(i);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(v));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(d, v));

    for (size_t i = 0; i < N; ++i) {
      const size_t mod = i % kLanesPerBlock;
      expected[i] = mod == (kLanesPerBlock - 1) || i >= N - 1 ? T(0) : T(2 + i);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightLanes<1>(d, v));
#else
    (void)d;
#endif  // #if HWY_TARGET != HWY_SCALAR
  }
};

HWY_NOINLINE void TestAllShiftLanes() {
  ForAllTypes(ForPartialVectors<TestShiftLanes>());
}

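// Broadcast<kLane>(v) copies lane kLane of each 128-bit block to all lanes of
// that block. TestBroadcastR recursively exercises every valid kLane, from
// the highest in-block lane down to 0; the kLane == -1 specialization below
// terminates the recursion.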
template <typename D, int kLane>
struct TestBroadcastR {
  HWY_NOINLINE void operator()() const {
    using T = typename D::T;
    const D d;
    const size_t N = Lanes(d);
    if (static_cast<size_t>(kLane) >= N) return;
    auto in_lanes = AllocateAligned<T>(N);
    std::fill(in_lanes.get(), in_lanes.get() + N, T(0));
    const size_t blockN = HWY_MIN(N * sizeof(T), 16) / sizeof(T);
    // Need to set within each 128-bit block
    for (size_t block = 0; block < N; block += blockN) {
      in_lanes[block + kLane] = static_cast<T>(block + 1);
    }
    const auto in = Load(d, in_lanes.get());
    auto expected = AllocateAligned<T>(N);
    for (size_t block = 0; block < N; block += blockN) {
      for (size_t i = 0; i < blockN; ++i) {
        expected[block + i] = T(block + 1);
      }
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Broadcast<kLane>(in));

    TestBroadcastR<D, kLane - 1>()();
  }
};

template <class D>
struct TestBroadcastR<D, -1> {
  void operator()() const {}
};

struct TestBroadcast {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    TestBroadcastR<D, HWY_MIN(MaxLanes(d), 16 / sizeof(T)) - 1>()();
  }
};

HWY_NOINLINE void TestAllBroadcast() {
  const ForPartialVectors<TestBroadcast> test;
  // No u/i8.
  test(uint16_t());
  test(int16_t());
  ForUIF3264(test);
}

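// Selects the descriptor for the lookup table in TestTableLookupBytes: by
// default the table is the same size as the index vector; kFull == true
// instead uses a full-width ScalableTag<T> table.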
template <bool kFull>
struct ChooseTableSize {
  template <typename T, typename DIdx>
  using type = DIdx;
};
template <>
struct ChooseTableSize<true> {
  template <typename T, typename DIdx>
  using type = ScalableTag<T>;
};

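// TableLookupBytes(in, indices) is a per-block byte shuffle (as in x86
// PSHUFB): output byte i of each block is the table byte named by index byte
// i. TableLookupBytesOr0 additionally returns 0 wherever the index byte has
// its most significant bit (0x80) set, which the second loop below verifies.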
template <bool kFull>
struct TestTableLookupBytes {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET != HWY_SCALAR
    RandomState rng;
    const typename ChooseTableSize<kFull>::template type<T, D> d_tbl;
    const Repartition<uint8_t, decltype(d_tbl)> d_tbl8;
    const size_t NT8 = Lanes(d_tbl8);

    const Repartition<uint8_t, D> d8;
    const size_t N = Lanes(d);
    const size_t N8 = Lanes(d8);

    // Random input bytes
    auto in_bytes = AllocateAligned<uint8_t>(NT8);
    for (size_t i = 0; i < NT8; ++i) {
      in_bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
    }
    const auto in = BitCast(d_tbl, Load(d_tbl8, in_bytes.get()));

    // Enough test data; for larger vectors, upper lanes will be zero.
    const uint8_t index_bytes_source[64] = {
        // Same index as source, multiple outputs from same input,
        // unused input (9), ascending/descending and nonconsecutive neighbors.
        0,  2,  1, 2, 15, 12, 13, 14, 6,  7,  8,  5,  4,  3,  10, 11,
        11, 10, 3, 4, 5,  8,  7,  6,  14, 13, 12, 15, 2,  1,  2,  0,
        4,  3,  2, 2, 5,  6,  7,  7,  15, 15, 15, 15, 15, 15, 0,  1};
    auto index_bytes = AllocateAligned<uint8_t>(N8);
    const size_t max_index = HWY_MIN(N8, 16) - 1;
    for (size_t i = 0; i < N8; ++i) {
      index_bytes[i] = (i < 64) ? index_bytes_source[i] : 0;
      // Avoid asan error for partial vectors.
      index_bytes[i] = static_cast<uint8_t>(HWY_MIN(index_bytes[i], max_index));
    }
    const auto indices = Load(d, reinterpret_cast<const T*>(index_bytes.get()));

    auto expected = AllocateAligned<T>(N);
    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());

    for (size_t block = 0; block < N8; block += 16) {
      for (size_t i = 0; i < 16 && (block + i) < N8; ++i) {
        const uint8_t index = index_bytes[block + i];
        HWY_ASSERT(block + index < N8);  // indices were already capped to N8.
        // For large vectors, the lane index may wrap around due to block.
        expected_bytes[block + i] = in_bytes[(block & 0xFF) + index];
      }
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytes(in, indices));

    // Individually test zeroing each byte position.
    for (size_t i = 0; i < N8; ++i) {
      const uint8_t prev_expected = expected_bytes[i];
      const uint8_t prev_index = index_bytes[i];
      expected_bytes[i] = 0;

      const int idx = 0x80 + (int(Random32(&rng) & 7) << 4);
      HWY_ASSERT(0x80 <= idx && idx < 256);
      index_bytes[i] = static_cast<uint8_t>(idx);

      const auto indices =
          Load(d, reinterpret_cast<const T*>(index_bytes.get()));
      HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytesOr0(in, indices));
      expected_bytes[i] = prev_expected;
      index_bytes[i] = prev_index;
    }
#else
    (void)d;
#endif
  }
};

HWY_NOINLINE void TestAllTableLookupBytes() {
  // Partial index, same-sized table.
  ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<false>>());

// TODO(janwas): requires LMUL trunc/ext, which is not yet implemented.
#if HWY_TARGET != HWY_RVV
  // Partial index, full-size table.
  ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<true>>());
#endif
}

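// InterleaveLower interleaves the lower halves of each 128-bit block of two
// vectors, lane by lane: a0, b0, a1, b1, ...; InterleaveUpper does the same
// with the upper halves. With the even/odd inputs below, each output block is
// a run of consecutive integers.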
struct TestInterleaveLower {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using TU = MakeUnsigned<T>;
    const size_t N = Lanes(d);
    auto even_lanes = AllocateAligned<T>(N);
    auto odd_lanes = AllocateAligned<T>(N);
    auto expected = AllocateAligned<T>(N);
    for (size_t i = 0; i < N; ++i) {
      even_lanes[i] = static_cast<T>(2 * i + 0);
      odd_lanes[i] = static_cast<T>(2 * i + 1);
    }
    const auto even = Load(d, even_lanes.get());
    const auto odd = Load(d, odd_lanes.get());

    const size_t blockN = HWY_MIN(16 / sizeof(T), N);
    for (size_t i = 0; i < N; ++i) {
      const size_t block = i / blockN;
      const size_t index = (i % blockN) + block * 2 * blockN;
      expected[i] = static_cast<T>(index & LimitsMax<TU>());
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(even, odd));
    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(d, even, odd));
  }
};

struct TestInterleaveUpper {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    if (N == 1) return;
    auto even_lanes = AllocateAligned<T>(N);
    auto odd_lanes = AllocateAligned<T>(N);
    auto expected = AllocateAligned<T>(N);
    for (size_t i = 0; i < N; ++i) {
      even_lanes[i] = static_cast<T>(2 * i + 0);
      odd_lanes[i] = static_cast<T>(2 * i + 1);
    }
    const auto even = Load(d, even_lanes.get());
    const auto odd = Load(d, odd_lanes.get());

    const size_t blockN = HWY_MIN(16 / sizeof(T), N);
    for (size_t i = 0; i < N; ++i) {
      const size_t block = i / blockN;
      expected[i] = T((i % blockN) + block * 2 * blockN + blockN);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveUpper(d, even, odd));
  }
};

HWY_NOINLINE void TestAllInterleave() {
  // Not DemoteVectors because this cannot be supported by HWY_SCALAR.
  ForAllTypes(ForShrinkableVectors<TestInterleaveLower>());
  ForAllTypes(ForShrinkableVectors<TestInterleaveUpper>());
}

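// ZipLower/ZipUpper interleave like InterleaveLower/Upper, but view each
// resulting pair of lanes as a single lane of twice the width: zipping T
// lanes a0 (lower) and b0 (upper) yields the MakeWide<T> value
// (b0 << (8 * sizeof(T))) | a0.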
struct TestZipLower {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using WideT = MakeWide<T>;
    static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
    static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
    const size_t N = Lanes(d);
    auto even_lanes = AllocateAligned<T>(N);
    auto odd_lanes = AllocateAligned<T>(N);
    for (size_t i = 0; i < N; ++i) {
      even_lanes[i] = static_cast<T>(2 * i + 0);
      odd_lanes[i] = static_cast<T>(2 * i + 1);
    }
    const auto even = Load(d, even_lanes.get());
    const auto odd = Load(d, odd_lanes.get());

    const Repartition<WideT, D> dw;
    const size_t NW = Lanes(dw);
    auto expected = AllocateAligned<WideT>(NW);
    const size_t blockN = HWY_MIN(size_t(16) / sizeof(WideT), NW);

    for (size_t i = 0; i < NW; ++i) {
      const size_t block = i / blockN;
      // Value of least-significant lane in lo-vector.
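      // Each wide block consumes 2 * blockN narrow lanes from each input, so
      // block number `block` starts at narrow value 4 * block * blockN;
      // within the block, wide lane i holds the narrow pair (lo, lo + 1).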
      const size_t lo = 2u * (i % blockN) + 4u * block * blockN;
      const size_t kBits = sizeof(T) * 8;
      expected[i] = static_cast<WideT>((static_cast<WideT>(lo + 1) << kBits) +
                                       static_cast<WideT>(lo));
    }
    HWY_ASSERT_VEC_EQ(dw, expected.get(), ZipLower(even, odd));
    HWY_ASSERT_VEC_EQ(dw, expected.get(), ZipLower(dw, even, odd));
  }
};

struct TestZipUpper {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using WideT = MakeWide<T>;
    static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
    static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
    const size_t N = Lanes(d);
    if (N < 16 / sizeof(T)) return;
    auto even_lanes = AllocateAligned<T>(N);
    auto odd_lanes = AllocateAligned<T>(N);
    for (size_t i = 0; i < N; ++i) {
      even_lanes[i] = static_cast<T>(2 * i + 0);
      odd_lanes[i] = static_cast<T>(2 * i + 1);
    }
    const auto even = Load(d, even_lanes.get());
    const auto odd = Load(d, odd_lanes.get());

    const Repartition<WideT, D> dw;
    const size_t NW = Lanes(dw);
    auto expected = AllocateAligned<WideT>(NW);
    const size_t blockN = HWY_MIN(size_t(16) / sizeof(WideT), NW);

    for (size_t i = 0; i < NW; ++i) {
      const size_t block = i / blockN;
      const size_t lo = 2u * (i % blockN) + 4u * block * blockN;
      const size_t kBits = sizeof(T) * 8;
      expected[i] = static_cast<WideT>(
          (static_cast<WideT>(lo + 2 * blockN + 1) << kBits) +
          static_cast<WideT>(lo + 2 * blockN));
    }
    HWY_ASSERT_VEC_EQ(dw, expected.get(), ZipUpper(dw, even, odd));
  }
};

HWY_NOINLINE void TestAllZip() {
  const ForDemoteVectors<TestZipLower> lower_unsigned;
  // TODO(janwas): enable after LowerHalf available
#if HWY_TARGET != HWY_RVV
  lower_unsigned(uint8_t());
#endif
  lower_unsigned(uint16_t());
#if HWY_CAP_INTEGER64
  lower_unsigned(uint32_t());  // generates u64
#endif

  const ForDemoteVectors<TestZipLower> lower_signed;
#if HWY_TARGET != HWY_RVV
  lower_signed(int8_t());
#endif
  lower_signed(int16_t());
#if HWY_CAP_INTEGER64
  lower_signed(int32_t());  // generates i64
#endif

  const ForShrinkableVectors<TestZipUpper> upper_unsigned;
#if HWY_TARGET != HWY_RVV
  upper_unsigned(uint8_t());
#endif
  upper_unsigned(uint16_t());
#if HWY_CAP_INTEGER64
  upper_unsigned(uint32_t());  // generates u64
#endif

  const ForShrinkableVectors<TestZipUpper> upper_signed;
#if HWY_TARGET != HWY_RVV
  upper_signed(int8_t());
#endif
  upper_signed(int16_t());
#if HWY_CAP_INTEGER64
  upper_signed(int32_t());  // generates i64
#endif

  // No float - zipping two f32 does not result in an f64.
}

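// CombineShiftRightBytes<kBytes>(d, hi, lo) concatenates each block of hi
// above the corresponding block of lo (32 bytes) and extracts the 16 bytes
// starting at byte offset kBytes, as in x86 palignr. The recursive R structs
// below cover every shift amount from the maximum down to 1.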
template <int kBytes>
struct TestCombineShiftRightBytesR {
  template <class T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
// Scalar does not define CombineShiftRightBytes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
    constexpr size_t kBlockSize = 16;
    static_assert(kBytes < kBlockSize, "Shift count is per block");
    const Repartition<uint8_t, D> d8;
    const size_t N8 = Lanes(d8);
    if (N8 < 16) return;
    auto hi_bytes = AllocateAligned<uint8_t>(N8);
    auto lo_bytes = AllocateAligned<uint8_t>(N8);
    auto expected_bytes = AllocateAligned<uint8_t>(N8);
    uint8_t combined[2 * kBlockSize];

    // Random inputs in each lane
    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(100); ++rep) {
      for (size_t i = 0; i < N8; ++i) {
        hi_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
        lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
      }
      for (size_t i = 0; i < N8; i += kBlockSize) {
        CopyBytes<kBlockSize>(&lo_bytes[i], combined);
        CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
        CopyBytes<kBlockSize>(combined + kBytes, &expected_bytes[i]);
      }

      const auto hi = BitCast(d, Load(d8, hi_bytes.get()));
      const auto lo = BitCast(d, Load(d8, lo_bytes.get()));
      const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
      HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightBytes<kBytes>(d, hi, lo));
    }

    TestCombineShiftRightBytesR<kBytes - 1>()(t, d);
#else
    (void)t;
    (void)d;
#endif  // #if HWY_TARGET != HWY_SCALAR
  }
};

template <int kLanes>
struct TestCombineShiftRightLanesR {
  template <class T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
// Scalar does not define CombineShiftRightBytes (needed for *Lanes).
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
    const Repartition<uint8_t, D> d8;
    const size_t N8 = Lanes(d8);
    if (N8 < 16) return;

    auto hi_bytes = AllocateAligned<uint8_t>(N8);
    auto lo_bytes = AllocateAligned<uint8_t>(N8);
    auto expected_bytes = AllocateAligned<uint8_t>(N8);
    constexpr size_t kBlockSize = 16;
    uint8_t combined[2 * kBlockSize];

    // Random inputs in each lane
    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(100); ++rep) {
      for (size_t i = 0; i < N8; ++i) {
        hi_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
        lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
      }
      for (size_t i = 0; i < N8; i += kBlockSize) {
        CopyBytes<kBlockSize>(&lo_bytes[i], combined);
        CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
        CopyBytes<kBlockSize>(combined + kLanes * sizeof(T),
                              &expected_bytes[i]);
      }

      const auto hi = BitCast(d, Load(d8, hi_bytes.get()));
      const auto lo = BitCast(d, Load(d8, lo_bytes.get()));
      const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
      HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightLanes<kLanes>(d, hi, lo));
    }

    TestCombineShiftRightLanesR<kLanes - 1>()(t, d);
#else
    (void)t;
    (void)d;
#endif  // #if HWY_TARGET != HWY_SCALAR
  }
};

template <>
struct TestCombineShiftRightBytesR<0> {
  template <class T, class D>
  void operator()(T /*unused*/, D /*unused*/) {}
};

template <>
struct TestCombineShiftRightLanesR<0> {
  template <class T, class D>
  void operator()(T /*unused*/, D /*unused*/) {}
};

struct TestCombineShiftRight {
  template <class T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    constexpr int kMaxBytes = HWY_MIN(16, int(MaxLanes(d) * sizeof(T)));
    TestCombineShiftRightBytesR<kMaxBytes - 1>()(t, d);
    TestCombineShiftRightLanesR<kMaxBytes / int(sizeof(T)) - 1>()(t, d);
  }
};

HWY_NOINLINE void TestAllCombineShiftRight() {
  // Need at least 2 lanes.
  ForAllTypes(ForShrinkableVectors<TestCombineShiftRight>());
}

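// The fixed Shuffle* ops permute the 32-bit (or 64-bit) lanes of each 128-bit
// block; the name's digits give the source lane for each output lane, from
// the highest output lane to the lowest. E.g. Shuffle2301 swaps adjacent lane
// pairs and Shuffle0123 reverses the lanes of each block.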
class TestSpecialShuffle32 {
 public:
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v = Iota(d, 0);
    VerifyLanes32(d, Shuffle2301(v), 2, 3, 0, 1, __FILE__, __LINE__);
    VerifyLanes32(d, Shuffle1032(v), 1, 0, 3, 2, __FILE__, __LINE__);
    VerifyLanes32(d, Shuffle0321(v), 0, 3, 2, 1, __FILE__, __LINE__);
    VerifyLanes32(d, Shuffle2103(v), 2, 1, 0, 3, __FILE__, __LINE__);
    VerifyLanes32(d, Shuffle0123(v), 0, 1, 2, 3, __FILE__, __LINE__);
  }

 private:
  template <class D, class V>
  HWY_NOINLINE void VerifyLanes32(D d, VecArg<V> actual, const size_t i3,
                                  const size_t i2, const size_t i1,
                                  const size_t i0, const char* filename,
                                  const int line) {
    using T = TFromD<D>;
    constexpr size_t kBlockN = 16 / sizeof(T);
    const size_t N = Lanes(d);
    if (N < 4) return;
    auto expected = AllocateAligned<T>(N);
    for (size_t block = 0; block < N; block += kBlockN) {
      expected[block + 3] = static_cast<T>(block + i3);
      expected[block + 2] = static_cast<T>(block + i2);
      expected[block + 1] = static_cast<T>(block + i1);
      expected[block + 0] = static_cast<T>(block + i0);
    }
    AssertVecEqual(d, expected.get(), actual, filename, line);
  }
};

class TestSpecialShuffle64 {
 public:
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v = Iota(d, 0);
    VerifyLanes64(d, Shuffle01(v), 0, 1, __FILE__, __LINE__);
  }

 private:
  template <class D, class V>
  HWY_NOINLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1,
                                  const size_t i0, const char* filename,
                                  const int line) {
    using T = TFromD<D>;
    constexpr size_t kBlockN = 16 / sizeof(T);
    const size_t N = Lanes(d);
    if (N < 2) return;
    auto expected = AllocateAligned<T>(N);
    for (size_t block = 0; block < N; block += kBlockN) {
      expected[block + 1] = static_cast<T>(block + i1);
      expected[block + 0] = static_cast<T>(block + i0);
    }
    AssertVecEqual(d, expected.get(), actual, filename, line);
  }
};

HWY_NOINLINE void TestAllSpecialShuffles() {
  const ForGE128Vectors<TestSpecialShuffle32> test32;
  test32(uint32_t());
  test32(int32_t());
  test32(float());

#if HWY_CAP_INTEGER64
  const ForGE128Vectors<TestSpecialShuffle64> test64;
  test64(uint64_t());
  test64(int64_t());
#endif

#if HWY_CAP_FLOAT64
  const ForGE128Vectors<TestSpecialShuffle64> test_d;
  test_d(double());
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(HwyBlockwiseTest);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllShiftBytes);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllShiftLanes);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllBroadcast);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytes);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllInterleave);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZip);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllCombineShiftRight);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllSpecialShuffles);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif