/*  Copyright (C) 2016  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_MIN_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_MIN_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/i_min.h>
#include <simdpp/core/extract.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/make_uint.h>
#include <simdpp/detail/mem_block.h>
#include <limits>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

static SIMDPP_INL
uint8_t i_reduce_min(const uint8x16& a)
{
#if SIMDPP_USE_NULL
    uint8_t r = a.el(0);
    for (unsigned i = 0; i < a.length; i++) {
        r = r < a.el(i) ? r : a.el(i);
    }
    return r;
#elif SIMDPP_USE_NEON64
    return vminvq_u8(a.native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
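    // Pairwise reduction: fold the upper half onto the lower half and
    // repeat with halved strides until element 0 holds the minimum of
    // all 16 lanes.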
    uint8x16 r = min(a, move16_l<8>(a));
    r = min(r, move16_l<4>(r));
    r = min(r, move16_l<2>(r));
    r = min(r, move16_l<1>(r));
    return extract<0>(r);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8_t i_reduce_min(const uint8<32>& a)
{
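    // Fold the two 128-bit halves of the 256-bit vector together, then
    // finish with the 128-bit reduction above.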
    uint8x16 r = detail::extract128<0>(a);
    r = min(r, detail::extract128<1>(a));
    return i_reduce_min(r);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint8_t i_reduce_min(const uint8<64>& a)
{
    uint8<32> r = detail::extract256<0>(a);
    r = min(r, detail::extract256<1>(a));
    return i_reduce_min(r);
}
#endif

template<unsigned N>
SIMDPP_INL uint8_t i_reduce_min(const uint8<N>& a)
{
#if SIMDPP_USE_NULL
    uint8_t r = std::numeric_limits<uint8_t>::max();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r = r < a.vec(j).el(i) ? r : a.vec(j).el(i);
        }
    }
    return r;
#else
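    // Fold all sub-vectors together with vector min, then reduce the
    // single remaining register.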
    uint8v r = a.vec(0);
    for (unsigned j = 1; j < a.vec_length; ++j) {
        r = min(r, a.vec(j));
    }
    return i_reduce_min(r);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
int8_t i_reduce_min(const int8x16& a)
{
#if SIMDPP_USE_NULL
    int8_t r = a.el(0);
    for (unsigned i = 0; i < a.length; i++) {
        r = r < a.el(i) ? r : a.el(i);
    }
    return r;
#elif SIMDPP_USE_NEON64
    return vminvq_s8(a.native());
#elif SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    int8x16 r = min(a, move16_l<8>(a));
    r = min(r, move16_l<4>(r));
    r = min(r, move16_l<2>(r));
    r = min(r, move16_l<1>(r));
    return extract<0>(r);
#elif SIMDPP_USE_SSE2
    // no instruction for int8 min available, only for uint8
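    // XOR with 0x80 flips the sign bit, which maps signed order onto
    // unsigned order; the bias is removed from the scalar result below.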
    uint8x16 ca = bit_xor(a, 0x80);
    return i_reduce_min(ca) ^ 0x80;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int8_t i_reduce_min(const int8x32& a)
{
    int8x16 r = detail::extract128<0>(a);
    r = min(r, detail::extract128<1>(a));
    return i_reduce_min(r);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int8_t i_reduce_min(const int8<64>& a)
{
    int8<32> r = detail::extract256<0>(a);
    r = min(r, detail::extract256<1>(a));
    return i_reduce_min(r);
}
#endif

template<unsigned N>
SIMDPP_INL int8_t i_reduce_min(const int8<N>& a)
{
#if SIMDPP_USE_NULL
    int8_t r = std::numeric_limits<int8_t>::max();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r = r < a.vec(j).el(i) ? r : a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_SSE2 && !SIMDPP_USE_SSE4_1
    // no instruction for int8 min available, only for uint8
    uint8x16 r = bit_xor(a.vec(0), 0x80);
    for (unsigned j = 1; j < a.vec_length; ++j) {
        uint8x16 ca = bit_xor(a.vec(j), 0x80);
        r = min(r, ca);
    }
    return i_reduce_min(r) ^ 0x80;
#else
    int8v r = a.vec(0);
    for (unsigned j = 1; j < a.vec_length; ++j) {
        r = min(r, a.vec(j));
    }
    return i_reduce_min(r);
#endif
}

// -----------------------------------------------------------------------------
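// The plain-SSE2 path of the unsigned 16-bit reduction dispatches to the
// signed reduction, so the signed overload is declared ahead of it.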
static SIMDPP_INL
int16_t i_reduce_min(const int16x8& a);

static SIMDPP_INL
uint16_t i_reduce_min(const uint16x8& a)
{
#if SIMDPP_USE_NULL
    uint16_t r = a.el(0);
    for (unsigned i = 0; i < a.length; i++) {
        r = r < a.el(i) ? r : a.el(i);
    }
    return r;
#elif SIMDPP_USE_NEON64
    return vminvq_u16(a.native());
#elif SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    uint16x8 r = min(a, move8_l<4>(a));
    r = min(r, move8_l<2>(r));
    r = min(r, move8_l<1>(r));
    return extract<0>(r);
#elif SIMDPP_USE_SSE2
    // no instruction for uint16 min available, only for int16
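    // Same bias trick as for int8: XOR with 0x8000 flips the sign bit so
    // that signed min over the biased values yields the unsigned minimum.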
    int16x8 ca = bit_xor(a, 0x8000);
    return i_reduce_min(ca) ^ 0x8000;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16_t i_reduce_min(const uint16x16& a)
{
    uint16x8 r = detail::extract128<0>(a);
    r = min(r, detail::extract128<1>(a));
    return i_reduce_min(r);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16_t i_reduce_min(const uint16<32>& a)
{
    uint16<16> r = detail::extract256<0>(a);
    r = min(r, detail::extract256<1>(a));
    return i_reduce_min(r);
}
#endif

template<unsigned N>
SIMDPP_INL uint16_t i_reduce_min(const uint16<N>& a)
{
#if SIMDPP_USE_NULL
    uint16_t r = std::numeric_limits<uint16_t>::max();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r = r < a.vec(j).el(i) ? r : a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_SSE2 && !SIMDPP_USE_SSE4_1
    // no instruction for uint16 min available, only for int16
    int16x8 r = bit_xor(a.vec(0), 0x8000);
    for (unsigned j = 1; j < a.vec_length; ++j) {
        int16x8 ca = bit_xor(a.vec(j), 0x8000);
        r = min(r, ca);
    }
    return i_reduce_min(r) ^ 0x8000;
#else
    uint16v r = a.vec(0);
    for (unsigned j = 1; j < a.vec_length; ++j) {
        r = min(r, a.vec(j));
    }
    return i_reduce_min(r);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
int16_t i_reduce_min(const int16x8& a)
{
#if SIMDPP_USE_NULL
    int16_t r = a.el(0);
    for (unsigned i = 0; i < a.length; i++) {
        r = r < a.el(i) ? r : a.el(i);
    }
    return r;
#elif SIMDPP_USE_NEON64
    return vminvq_s16(a.native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    int16x8 r = min(a, move8_l<4>(a));
    r = min(r, move8_l<2>(r));
    r = min(r, move8_l<1>(r));
    return extract<0>(r);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int16_t i_reduce_min(const int16x16& a)
{
    int16x8 r = detail::extract128<0>(a);
    r = min(r, detail::extract128<1>(a));
    return i_reduce_min(r);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int16_t i_reduce_min(const int16<32>& a)
{
    int16<16> r = detail::extract256<0>(a);
    r = min(r, detail::extract256<1>(a));
    return i_reduce_min(r);
}
#endif

template<unsigned N>
SIMDPP_INL int16_t i_reduce_min(const int16<N>& a)
{
#if SIMDPP_USE_NULL
    int16_t r = std::numeric_limits<int16_t>::max();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r = r < a.vec(j).el(i) ? r : a.vec(j).el(i);
        }
    }
    return r;
#else
    int16v r = a.vec(0);
    for (unsigned j = 1; j < a.vec_length; ++j) {
        r = min(r, a.vec(j));
    }
    return i_reduce_min(r);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32_t i_reduce_min(const uint32x4& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = a.el(0);
    for (unsigned i = 0; i < a.length; i++) {
        r = r < a.el(i) ? r : a.el(i);
    }
    return r;
#elif SIMDPP_USE_NEON64
    return vminvq_u32(a.native());
#elif SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    uint32x4 r = min(a, move4_l<2>(a));
    r = min(r, move4_l<1>(r));
    return extract<0>(r);
#elif SIMDPP_USE_SSE2
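    // SSE2 provides no 32-bit min in either signedness, so store the
    // vector to memory and reduce with scalar compares.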
    mem_block<uint32x4> b = a;
    uint32_t r = b[0];
    for (unsigned i = 1; i < b.length; i++) {
        r = r < b[i] ? r : b[i];
    }
    return r;
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32_t i_reduce_min(const uint32x8& a)
{
    uint32x4 r = detail::extract128<0>(a);
    r = min(r, detail::extract128<1>(a));
    r = min(r, move4_l<2>(r));
    r = min(r, move4_l<1>(r));
    return extract<0>(r);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32_t i_reduce_min(const uint32<16>& a)
{
    return i_reduce_min((uint32<8>)min(extract256<0>(a), extract256<1>(a)));
}
#endif

template<unsigned N>
SIMDPP_INL uint32_t i_reduce_min(const uint32<N>& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = std::numeric_limits<uint32_t>::max();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r = r < a.vec(j).el(i) ? r : a.vec(j).el(i);
        }
    }
    return r;
#else
    uint32v r = a.vec(0);
    for (unsigned j = 1; j < a.vec_length; ++j) {
        r = min(r, a.vec(j));
    }
    return i_reduce_min(r);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
int32_t i_reduce_min(const int32x4& a)
{
#if SIMDPP_USE_NULL
    int32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r = r < a.el(i) ? r : a.el(i);
    }
    return r;
#elif SIMDPP_USE_NEON64
    return vminvq_s32(a.native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    int32x4 r = min(a, move4_l<2>(a));
    r = min(r, move4_l<1>(r));
    return extract<0>(r);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32_t i_reduce_min(const int32x8& a)
{
    int32x4 r = detail::extract128<0>(a);
    r = min(r, detail::extract128<1>(a));
    r = min(r, move4_l<2>(r));
    r = min(r, move4_l<1>(r));
    return extract<0>(r);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
int32_t i_reduce_min(const int32<16>& a)
{
    return i_reduce_min((int32<8>)min(extract256<0>(a), extract256<1>(a)));
}
#endif

template<unsigned N>
SIMDPP_INL int32_t i_reduce_min(const int32<N>& a)
{
#if SIMDPP_USE_NULL
    int32_t r = std::numeric_limits<int32_t>::max();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r = r < a.vec(j).el(i) ? r : a.vec(j).el(i);
        }
    }
    return r;
#else
    int32v r = a.vec(0);
    for (unsigned j = 1; j < a.vec_length; ++j) {
        r = min(r, a.vec(j));
    }
    return i_reduce_min(r);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint64_t i_reduce_min(const uint64x2& a)
{
#if SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
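    // Only these targets can compute an element-wise 64-bit min (natively
    // or via compare-and-select inside min()), so a single fold leaves the
    // minimum in the bottom lane.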
    uint64x2 r = min(a, move2_l<1>(a));
    return extract<0>(r);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON32
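    // No 64-bit vector min here: read both lanes through memory and
    // compare in scalar code.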
    mem_block<uint64x2> b = a;
    return b[0] < b[1] ? b[0] : b[1];
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    uint64_t r = a.el(0);
    for (unsigned i = 0; i < a.length; i++) {
        r = r < a.el(i) ? r : a.el(i);
    }
    return r;
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64_t i_reduce_min(const uint64x4& a)
{
    uint64x2 r = detail::extract128<0>(a);
    r = min(r, detail::extract128<1>(a));
    r = min(r, move2_l<1>(r));
    return extract<0>(r);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64_t i_reduce_min(const uint64<8>& a)
{
    return i_reduce_min((uint64<4>)min(extract256<0>(a), extract256<1>(a)));
}
#endif

template<unsigned N>
SIMDPP_INL uint64_t i_reduce_min(const uint64<N>& a)
{
#if SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
    uint64v r = a.vec(0);
    for (unsigned j = 1; j < a.vec_length; ++j) {
        r = min(r, a.vec(j));
    }
    return i_reduce_min(r);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON32
    uint64_t r = std::numeric_limits<uint64_t>::max();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        mem_block<uint64v> b = a.vec(j);
        for (unsigned i = 0; i < a.base_length; i++) {
            r = r < b[i] ? r : b[i];
        }
    }
    return r;
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    uint64_t r = std::numeric_limits<uint64_t>::max();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r = r < a.vec(j).el(i) ? r : a.vec(j).el(i);
        }
    }
    return r;
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
int64_t i_reduce_min(const int64x2& a)
{
#if SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    int64x2 r = min(a, move2_l<1>(a));
    return extract<0>(r);
#elif SIMDPP_USE_SSE2
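    // Same scalar fallback as for uint64x2: SSE2 lacks a 64-bit min.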
    mem_block<int64x2> b = a;
    return b[0] < b[1] ? b[0] : b[1];
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    int64_t r = a.el(0);
    for (unsigned i = 0; i < a.length; i++) {
        r = r < a.el(i) ? r : a.el(i);
    }
    return r;
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int64_t i_reduce_min(const int64x4& a)
{
    int64x2 r = detail::extract128<0>(a);
    r = min(r, detail::extract128<1>(a));
    r = min(r, move2_l<1>(r));
    return extract<0>(r);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
int64_t i_reduce_min(const int64<8>& a)
{
    return i_reduce_min((int64<4>)min(extract256<0>(a), extract256<1>(a)));
}
#endif

template<unsigned N>
SIMDPP_INL int64_t i_reduce_min(const int64<N>& a)
{
#if SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    int64v r = a.vec(0);
    for (unsigned j = 1; j < a.vec_length; ++j) {
        r = min(r, a.vec(j));
    }
    return i_reduce_min(r);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON32
    int64_t r = std::numeric_limits<int64_t>::max();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        mem_block<int64v> b = a.vec(j);
        for (unsigned i = 0; i < a.base_length; i++) {
            r = r < b[i] ? r : b[i];
        }
    }
    return r;
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    int64_t r = std::numeric_limits<int64_t>::max();
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r = r < a.vec(j).el(i) ? r : a.vec(j).el(i);
        }
    }
    return r;
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}

// -----------------------------------------------------------------------------

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif