1 /* Copyright (C) 2016 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6 */
7
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_MIN_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_MIN_H
10
11 #ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13 #endif
14
15 #include <simdpp/types.h>
16 #include <simdpp/core/i_min.h>
17 #include <simdpp/core/extract.h>
18 #include <simdpp/core/move_l.h>
19 #include <simdpp/core/make_uint.h>
20 #include <simdpp/detail/mem_block.h>
21 #include <limits>
22
23 namespace simdpp {
24 namespace SIMDPP_ARCH_NAMESPACE {
25 namespace detail {
26 namespace insn {
27
28 static SIMDPP_INL
i_reduce_min(const uint8x16 & a)29 uint8_t i_reduce_min(const uint8x16& a)
30 {
31 #if SIMDPP_USE_NULL
32 uint8_t r = a.el(0);
33 for (unsigned i = 0; i < a.length; i++) {
34 r = r < a.el(i) ? r : a.el(i);
35 }
36 return r;
37 #elif SIMDPP_USE_NEON64
38 return vminvq_u8(a.native());
39 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
40 uint8x16 r = min(a, move16_l<8>(a));
41 r = min(r, move16_l<4>(r));
42 r = min(r, move16_l<2>(r));
43 r = min(r, move16_l<1>(r));
44 return extract<0>(r);
45 #endif
46 }
47
#if SIMDPP_USE_AVX2
// Returns the smallest element of a 32x8-bit unsigned vector.
static SIMDPP_INL
uint8_t i_reduce_min(const uint8<32>& a)
{
    // Fold the two 128-bit halves into one, then reduce that
    uint8x16 r = detail::extract128<0>(a);
    r = min(r, detail::extract128<1>(a));
    return i_reduce_min(r);
}
#endif
57
#if SIMDPP_USE_AVX512BW
// Returns the smallest element of a 64x8-bit unsigned vector.
SIMDPP_INL uint8_t i_reduce_min(const uint8<64>& a)
{
    // Fold the two 256-bit halves into one, then reduce that
    uint8<32> r = detail::extract256<0>(a);
    r = min(r, detail::extract256<1>(a));
    return i_reduce_min(r);
}
#endif
66
67 template<unsigned N>
i_reduce_min(const uint8<N> & a)68 SIMDPP_INL uint8_t i_reduce_min(const uint8<N>& a)
69 {
70 #if SIMDPP_USE_NULL
71 uint8_t r = std::numeric_limits<uint8_t>::max();
72 for (unsigned j = 0; j < a.vec_length; ++j) {
73 for (unsigned i = 0; i < a.base_length; i++) {
74 r = r < a.vec(j).el(i) ? r : a.vec(j).el(i);
75 }
76 }
77 return r;
78 #else
79 uint8v r = a.vec(0);
80 for (unsigned j = 1; j < a.vec_length; ++j) {
81 r = min(r, a.vec(j));
82 }
83 return i_reduce_min(r);
84 #endif
85 }
86
87 // -----------------------------------------------------------------------------
88
89 static SIMDPP_INL
i_reduce_min(const int8x16 & a)90 int8_t i_reduce_min(const int8x16& a)
91 {
92 #if SIMDPP_USE_NULL
93 int8_t r = a.el(0);
94 for (unsigned i = 0; i < a.length; i++) {
95 r = r < a.el(i) ? r : a.el(i);
96 }
97 return r;
98 #elif SIMDPP_USE_NEON64
99 return vminvq_s8(a.native());
100 #elif SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
101 int8x16 r = min(a, move16_l<8>(a));
102 r = min(r, move16_l<4>(r));
103 r = min(r, move16_l<2>(r));
104 r = min(r, move16_l<1>(r));
105 return extract<0>(r);
106 #elif SIMDPP_USE_SSE2
107 // no instruction for int8 min available, only for uint8
108 uint8x16 ca = bit_xor(a, 0x80);
109 return i_reduce_min(ca) ^ 0x80;
110 #endif
111 }
112
#if SIMDPP_USE_AVX2
// Returns the smallest element of a 32x8-bit signed vector.
static SIMDPP_INL
int8_t i_reduce_min(const int8x32& a)
{
    // Fold the two 128-bit halves into one, then reduce that
    int8x16 r = detail::extract128<0>(a);
    r = min(r, detail::extract128<1>(a));
    return i_reduce_min(r);
}
#endif
122
#if SIMDPP_USE_AVX512BW
// Returns the smallest element of a 64x8-bit signed vector.
SIMDPP_INL int8_t i_reduce_min(const int8<64>& a)
{
    // Fold the two 256-bit halves into one, then reduce that
    int8<32> r = detail::extract256<0>(a);
    r = min(r, detail::extract256<1>(a));
    return i_reduce_min(r);
}
#endif
131
132 template<unsigned N>
i_reduce_min(const int8<N> & a)133 SIMDPP_INL int8_t i_reduce_min(const int8<N>& a)
134 {
135 #if SIMDPP_USE_NULL
136 int8_t r = std::numeric_limits<int8_t>::max();
137 for (unsigned j = 0; j < a.vec_length; ++j) {
138 for (unsigned i = 0; i < a.base_length; i++) {
139 r = r < a.vec(j).el(i) ? r : a.vec(j).el(i);
140 }
141 }
142 return r;
143 #elif SIMDPP_USE_SSE2 && !SIMDPP_USE_SSE4_1
144 // no instruction for int8 min available, only for uint8
145 uint8x16 r = bit_xor(a.vec(0), 0x80);
146 for (unsigned j = 1; j < a.vec_length; ++j) {
147 uint8x16 ca = bit_xor(a.vec(j), 0x80);
148 r = min(r, ca);
149 }
150 return i_reduce_min(r) ^ 0x80;
151 #else
152 int8v r = a.vec(0);
153 for (unsigned j = 1; j < a.vec_length; ++j) {
154 r = min(r, a.vec(j));
155 }
156 return i_reduce_min(r);
157 #endif
158 }
159
160 // -----------------------------------------------------------------------------
161 static SIMDPP_INL
162 int16_t i_reduce_min(const int16x8& a);
163
164 static SIMDPP_INL
i_reduce_min(const uint16x8 & a)165 uint16_t i_reduce_min(const uint16x8& a)
166 {
167 #if SIMDPP_USE_NULL
168 uint16_t r = a.el(0);
169 for (unsigned i = 0; i < a.length; i++) {
170 r = r < a.el(i) ? r : a.el(i);
171 }
172 return r;
173 #elif SIMDPP_USE_NEON64
174 return vminvq_u16(a.native());
175 #elif SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
176 uint16x8 r = min(a, move8_l<4>(a));
177 r = min(r, move8_l<2>(r));
178 r = min(r, move8_l<1>(r));
179 return extract<0>(r);
180 #elif SIMDPP_USE_SSE2
181 // no instruction for uint16 min available, only for int16
182 int16x8 ca = bit_xor(a, 0x8000);
183 return i_reduce_min(ca) ^ 0x8000;
184 #endif
185 }
186
#if SIMDPP_USE_AVX2
// Returns the smallest element of a 16x16-bit unsigned vector.
static SIMDPP_INL
uint16_t i_reduce_min(const uint16x16& a)
{
    // Fold the two 128-bit halves into one, then reduce that
    uint16x8 r = detail::extract128<0>(a);
    r = min(r, detail::extract128<1>(a));
    return i_reduce_min(r);
}
#endif
196
#if SIMDPP_USE_AVX512BW
// Returns the smallest element of a 32x16-bit unsigned vector.
SIMDPP_INL uint16_t i_reduce_min(const uint16<32>& a)
{
    // Fold the two 256-bit halves into one, then reduce that
    uint16<16> r = detail::extract256<0>(a);
    r = min(r, detail::extract256<1>(a));
    return i_reduce_min(r);
}
#endif
205
206 template<unsigned N>
i_reduce_min(const uint16<N> & a)207 SIMDPP_INL uint16_t i_reduce_min(const uint16<N>& a)
208 {
209 #if SIMDPP_USE_NULL
210 uint16_t r = std::numeric_limits<uint16_t>::max();
211 for (unsigned j = 0; j < a.vec_length; ++j) {
212 for (unsigned i = 0; i < a.base_length; i++) {
213 r = r < a.vec(j).el(i) ? r : a.vec(j).el(i);
214 }
215 }
216 return r;
217 #elif SIMDPP_USE_SSE2 && !SIMDPP_USE_SSE4_1
218 // no instruction for uint16 min available, only for int16
219 int16x8 r = bit_xor(a.vec(0), 0x8000);
220 for (unsigned j = 1; j < a.vec_length; ++j) {
221 int16x8 ca = bit_xor(a.vec(j), 0x8000);
222 r = min(r, ca);
223 }
224 return i_reduce_min(r) ^ 0x8000;
225 #else
226 uint16v r = a.vec(0);
227 for (unsigned j = 1; j < a.vec_length; ++j) {
228 r = min(r, a.vec(j));
229 }
230 return i_reduce_min(r);
231 #endif
232 }
233
234 // -----------------------------------------------------------------------------
235
236 static SIMDPP_INL
i_reduce_min(const int16x8 & a)237 int16_t i_reduce_min(const int16x8& a)
238 {
239 #if SIMDPP_USE_NULL
240 int16_t r = a.el(0);
241 for (unsigned i = 0; i < a.length; i++) {
242 r = r < a.el(i) ? r : a.el(i);
243 }
244 return r;
245 #elif SIMDPP_USE_NEON64
246 return vminvq_s16(a.native());
247 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
248 int16x8 r = min(a, move8_l<4>(a));
249 r = min(r, move8_l<2>(r));
250 r = min(r, move8_l<1>(r));
251 return extract<0>(r);
252 #endif
253 }
254
#if SIMDPP_USE_AVX2
// Returns the smallest element of a 16x16-bit signed vector.
static SIMDPP_INL
int16_t i_reduce_min(const int16x16& a)
{
    // Fold the two 128-bit halves into one, then reduce that
    int16x8 r = detail::extract128<0>(a);
    r = min(r, detail::extract128<1>(a));
    return i_reduce_min(r);
}
#endif
264
#if SIMDPP_USE_AVX512BW
// Returns the smallest element of a 32x16-bit signed vector.
SIMDPP_INL int16_t i_reduce_min(const int16<32>& a)
{
    // Fold the two 256-bit halves into one, then reduce that
    int16<16> r = detail::extract256<0>(a);
    r = min(r, detail::extract256<1>(a));
    return i_reduce_min(r);
}
#endif
273
274 template<unsigned N>
i_reduce_min(const int16<N> & a)275 SIMDPP_INL int16_t i_reduce_min(const int16<N>& a)
276 {
277 #if SIMDPP_USE_NULL
278 int16_t r = std::numeric_limits<int16_t>::max();
279 for (unsigned j = 0; j < a.vec_length; ++j) {
280 for (unsigned i = 0; i < a.base_length; i++) {
281 r = r < a.vec(j).el(i) ? r : a.vec(j).el(i);
282 }
283 }
284 return r;
285 #else
286 int16v r = a.vec(0);
287 for (unsigned j = 1; j < a.vec_length; ++j) {
288 r = min(r, a.vec(j));
289 }
290 return i_reduce_min(r);
291 #endif
292 }
293
294 // -----------------------------------------------------------------------------
295
296 static SIMDPP_INL
i_reduce_min(const uint32x4 & a)297 uint32_t i_reduce_min(const uint32x4& a)
298 {
299 #if SIMDPP_USE_NULL
300 uint32_t r = a.el(0);
301 for (unsigned i = 0; i < a.length; i++) {
302 r = r < a.el(i) ? r : a.el(i);
303 }
304 return r;
305 #elif SIMDPP_USE_NEON64
306 return vminvq_u32(a.native());
307 #elif SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
308 uint32x4 r = min(a, move4_l<2>(a));
309 r = min(r, move4_l<1>(r));
310 return extract<0>(r);
311 #elif SIMDPP_USE_SSE2
312 mem_block<uint32x4> b = a;
313 uint32_t r = b[0];
314 for (unsigned i = 1; i < b.length; i++) {
315 r = r < b[i] ? r : b[i];
316 }
317 return r;
318 #endif
319 }
320
#if SIMDPP_USE_AVX2
// Returns the smallest element of an 8x32-bit unsigned vector.
static SIMDPP_INL
uint32_t i_reduce_min(const uint32x8& a)
{
    // Fold the two 128-bit halves, then do the in-register reduction
    // directly (AVX2 guarantees 32-bit unsigned min is available)
    uint32x4 r = detail::extract128<0>(a);
    r = min(r, detail::extract128<1>(a));
    r = min(r, move4_l<2>(r));
    r = min(r, move4_l<1>(r));
    return extract<0>(r);
}
#endif
332
#if SIMDPP_USE_AVX512F
// Returns the smallest element of a 16x32-bit unsigned vector.
static SIMDPP_INL
uint32_t i_reduce_min(const uint32<16>& a)
{
    // Fold the two 256-bit halves into one, then reduce that
    return i_reduce_min((uint32<8>)min(extract256<0>(a), extract256<1>(a)));
}
#endif
340
341 template<unsigned N>
i_reduce_min(const uint32<N> & a)342 SIMDPP_INL uint32_t i_reduce_min(const uint32<N>& a)
343 {
344 #if SIMDPP_USE_NULL
345 uint32_t r = std::numeric_limits<uint32_t>::max();
346 for (unsigned j = 0; j < a.vec_length; ++j) {
347 for (unsigned i = 0; i < a.base_length; i++) {
348 r = r < a.vec(j).el(i) ? r : a.vec(j).el(i);
349 }
350 }
351 return r;
352 #else
353 uint32v r = a.vec(0);
354 for (unsigned j = 1; j < a.vec_length; ++j) {
355 r = min(r, a.vec(j));
356 }
357 return i_reduce_min(r);
358 #endif
359 }
360
361 // -----------------------------------------------------------------------------
362
363 static SIMDPP_INL
i_reduce_min(const int32x4 & a)364 int32_t i_reduce_min(const int32x4& a)
365 {
366 #if SIMDPP_USE_NULL
367 int32_t r = a.el(0);
368 for (unsigned i = 1; i < a.length; i++) {
369 r = r < a.el(i) ? r : a.el(i);
370 }
371 return r;
372 #elif SIMDPP_USE_NEON64
373 return vminvq_s32(a.native());
374 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
375 int32x4 r = min(a, move4_l<2>(a));
376 r = min(r, move4_l<1>(r));
377 return extract<0>(r);
378 #endif
379 }
380
#if SIMDPP_USE_AVX2
// Returns the smallest element of an 8x32-bit signed vector.
static SIMDPP_INL
int32_t i_reduce_min(const int32x8& a)
{
    // Fold the two 128-bit halves, then reduce in-register
    int32x4 r = detail::extract128<0>(a);
    r = min(r, detail::extract128<1>(a));
    r = min(r, move4_l<2>(r));
    r = min(r, move4_l<1>(r));
    return extract<0>(r);
}
#endif
392
#if SIMDPP_USE_AVX512F
// Returns the smallest element of a 16x32-bit signed vector.
static SIMDPP_INL
int32_t i_reduce_min(const int32<16>& a)
{
    // Fold the two 256-bit halves into one, then reduce that
    return i_reduce_min((int32<8>)min(extract256<0>(a), extract256<1>(a)));
}
#endif
400
401 template<unsigned N>
i_reduce_min(const int32<N> & a)402 SIMDPP_INL int32_t i_reduce_min(const int32<N>& a)
403 {
404 #if SIMDPP_USE_NULL
405 int32_t r = std::numeric_limits<int32_t>::max();
406 for (unsigned j = 0; j < a.vec_length; ++j) {
407 for (unsigned i = 0; i < a.base_length; i++) {
408 r = r < a.vec(j).el(i) ? r : a.vec(j).el(i);
409 }
410 }
411 return r;
412 #else
413 int32v r = a.vec(0);
414 for (unsigned j = 1; j < a.vec_length; ++j) {
415 r = min(r, a.vec(j));
416 }
417 return i_reduce_min(r);
418 #endif
419 }
420
421 // -----------------------------------------------------------------------------
422
423 static SIMDPP_INL
i_reduce_min(const uint64x2 & a)424 uint64_t i_reduce_min(const uint64x2& a)
425 {
426 #if SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
427 uint64x2 r = min(a, move2_l<1>(a));
428 return extract<0>(r);
429 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON32
430 mem_block<uint64x2> b = a;
431 return b[0] < b[1] ? b[0] : b[1];
432 #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
433 uint64_t r = a.el(0);
434 for (unsigned i = 0; i < a.length; i++) {
435 r = r < a.el(i) ? r : a.el(i);
436 }
437 return r;
438 #else
439 return SIMDPP_NOT_IMPLEMENTED1(a);
440 #endif
441 }
442
#if SIMDPP_USE_AVX2
// Returns the smallest element of a 4x64-bit unsigned vector.
static SIMDPP_INL
uint64_t i_reduce_min(const uint64x4& a)
{
    // Fold the two 128-bit halves, then reduce in-register
    uint64x2 r = detail::extract128<0>(a);
    r = min(r, detail::extract128<1>(a));
    r = min(r, move2_l<1>(r));
    return extract<0>(r);
}
#endif
453
#if SIMDPP_USE_AVX512F
// Returns the smallest element of an 8x64-bit unsigned vector.
static SIMDPP_INL
uint64_t i_reduce_min(const uint64<8>& a)
{
    // Fold the two 256-bit halves into one, then reduce that
    return i_reduce_min((uint64<4>)min(extract256<0>(a), extract256<1>(a)));
}
#endif
461
462 template<unsigned N>
i_reduce_min(const uint64<N> & a)463 SIMDPP_INL uint64_t i_reduce_min(const uint64<N>& a)
464 {
465 #if SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
466 uint64v r = a.vec(0);
467 for (unsigned j = 1; j < a.vec_length; ++j) {
468 r = min(r, a.vec(j));
469 }
470 return i_reduce_min(r);
471 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON32
472 uint64_t r = std::numeric_limits<uint64_t>::max();
473 for (unsigned j = 0; j < a.vec_length; ++j) {
474 mem_block<uint64v> b = a.vec(j);
475 for (unsigned i = 0; i < a.base_length; i++) {
476 r = r < b[i] ? r : b[i];
477 }
478 }
479 return r;
480 #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
481 uint64_t r = std::numeric_limits<uint64_t>::max();
482 for (unsigned j = 0; j < a.vec_length; ++j) {
483 for (unsigned i = 0; i < a.base_length; i++) {
484 r = r < a.vec(j).el(i) ? r : a.vec(j).el(i);
485 }
486 }
487 return r;
488 #else
489 return SIMDPP_NOT_IMPLEMENTED1(a);
490 #endif
491 }
492
493 // -----------------------------------------------------------------------------
494
495 static SIMDPP_INL
i_reduce_min(const int64x2 & a)496 int64_t i_reduce_min(const int64x2& a)
497 {
498 #if SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
499 int64x2 r = min(a, move2_l<1>(a));
500 return extract<0>(r);
501 #elif SIMDPP_USE_SSE2
502 mem_block<int64x2> b = a;
503 return b[0] < b[1] ? b[0] : b[1];
504 #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
505 int64_t r = a.el(0);
506 for (unsigned i = 0; i < a.length; i++) {
507 r = r < a.el(i) ? r : a.el(i);
508 }
509 return r;
510 #else
511 return SIMDPP_NOT_IMPLEMENTED1(a);
512 #endif
513 }
514
#if SIMDPP_USE_AVX2
// Returns the smallest element of a 4x64-bit signed vector.
static SIMDPP_INL
int64_t i_reduce_min(const int64x4& a)
{
    // Fold the two 128-bit halves, then reduce in-register
    int64x2 r = detail::extract128<0>(a);
    r = min(r, detail::extract128<1>(a));
    r = min(r, move2_l<1>(r));
    return extract<0>(r);
}
#endif
525
#if SIMDPP_USE_AVX512F
// Returns the smallest element of an 8x64-bit signed vector.
static SIMDPP_INL
int64_t i_reduce_min(const int64<8>& a)
{
    // Fold the two 256-bit halves into one, then reduce that
    return i_reduce_min((int64<4>)min(extract256<0>(a), extract256<1>(a)));
}
#endif
533
534 template<unsigned N>
i_reduce_min(const int64<N> & a)535 SIMDPP_INL int64_t i_reduce_min(const int64<N>& a)
536 {
537 #if SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
538 int64v r = a.vec(0);
539 for (unsigned j = 1; j < a.vec_length; ++j) {
540 r = min(r, a.vec(j));
541 }
542 return i_reduce_min(r);
543 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON32
544 int64_t r = std::numeric_limits<int64_t>::max();
545 for (unsigned j = 0; j < a.vec_length; ++j) {
546 mem_block<int64v> b = a.vec(j);
547 for (unsigned i = 0; i < a.base_length; i++) {
548 r = r < b[i] ? r : b[i];
549 }
550 }
551 return r;
552 #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
553 int64_t r = std::numeric_limits<int64_t>::max();
554 for (unsigned j = 0; j < a.vec_length; ++j) {
555 for (unsigned i = 0; i < a.base_length; i++) {
556 r = r < a.vec(j).el(i) ? r : a.vec(j).el(i);
557 }
558 }
559 return r;
560 #else
561 return SIMDPP_NOT_IMPLEMENTED1(a);
562 #endif
563 }
564
565 // -----------------------------------------------------------------------------
566
567 } // namespace insn
568 } // namespace detail
569 } // namespace SIMDPP_ARCH_NAMESPACE
570 } // namespace simdpp
571
572 #endif
573
574