1 /*  Copyright (C) 2016  Povilas Kanapickas <povilas@radix.lt>
2 
3     Distributed under the Boost Software License, Version 1.0.
4         (See accompanying file LICENSE_1_0.txt or copy at
5             http://www.boost.org/LICENSE_1_0.txt)
6 */
7 
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_MAX_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_MAX_H
10 
11 #ifndef LIBSIMDPP_SIMD_H
12     #error "This file must be included through simd.h"
13 #endif
14 
15 #include <simdpp/types.h>
16 #include <simdpp/core/i_max.h>
17 #include <simdpp/core/extract.h>
18 #include <simdpp/core/move_l.h>
19 #include <simdpp/core/make_uint.h>
20 #include <simdpp/detail/mem_block.h>
21 #include <simdpp/detail/extract128.h>
22 #include <limits>
23 
24 namespace simdpp {
25 namespace SIMDPP_ARCH_NAMESPACE {
26 namespace detail {
27 namespace insn {
28 
29 static SIMDPP_INL
i_reduce_max(const uint8x16 & a)30 uint8_t i_reduce_max(const uint8x16& a)
31 {
32 #if SIMDPP_USE_NULL
33     uint8_t r = a.el(0);
34     for (unsigned i = 0; i < a.length; i++) {
35         r = r > a.el(i) ? r : a.el(i);
36     }
37     return r;
38 #elif SIMDPP_USE_NEON64
39     return vmaxvq_u8(a.native());
40 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
41     uint8x16 r = max(a, move16_l<8>(a));
42     r = max(r, move16_l<4>(r));
43     r = max(r, move16_l<2>(r));
44     r = max(r, move16_l<1>(r));
45     return extract<0>(r);
46 #endif
47 }
48 
#if SIMDPP_USE_AVX2
// 32-lane overload: combine the two 16-lane halves with a lane-wise max and
// let the uint8x16 overload finish the reduction.
static SIMDPP_INL
uint8_t i_reduce_max(const uint8<32>& a)
{
    const uint8x16 lo = detail::extract128<0>(a);
    const uint8x16 folded = max(lo, detail::extract128<1>(a));
    return i_reduce_max(folded);
}
#endif
58 
#if SIMDPP_USE_AVX512BW
// 64-lane overload: combine the two 32-lane halves with a lane-wise max and
// defer to the uint8<32> overload.
SIMDPP_INL
uint8_t i_reduce_max(const uint8<64>& a)
{
    const uint8<32> lo = detail::extract256<0>(a);
    const uint8<32> folded = max(lo, detail::extract256<1>(a));
    return i_reduce_max(folded);
}
#endif
67 
68 template<unsigned N>
i_reduce_max(const uint8<N> & a)69 SIMDPP_INL uint8_t i_reduce_max(const uint8<N>& a)
70 {
71 #if SIMDPP_USE_NULL
72     uint8_t r = std::numeric_limits<uint8_t>::min();
73     for (unsigned j = 0; j < a.vec_length; ++j) {
74         for (unsigned i = 0; i < a.base_length; i++) {
75             r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
76         }
77     }
78     return r;
79 #else
80     uint8v r = a.vec(0);
81     for (unsigned j = 1; j < a.vec_length; ++j) {
82         r = max(r, a.vec(j));
83     }
84     return i_reduce_max(r);
85 #endif
86 }
87 
88 // -----------------------------------------------------------------------------
89 
90 static SIMDPP_INL
i_reduce_max(const int8x16 & a)91 int8_t i_reduce_max(const int8x16& a)
92 {
93 #if SIMDPP_USE_NULL
94     int8_t r = a.el(0);
95     for (unsigned i = 0; i < a.length; i++) {
96         r = r > a.el(i) ? r : a.el(i);
97     }
98     return r;
99 #elif SIMDPP_USE_NEON64
100     return vmaxvq_s8(a.native());
101 #elif SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
102     int8x16 r = a;
103     r = max(r, move16_l<8>(r));
104     r = max(r, move16_l<4>(r));
105     r = max(r, move16_l<2>(r));
106     r = max(r, move16_l<1>(r));
107     return extract<0>(r);
108 #elif SIMDPP_USE_SSE2
109     // no instruction for int8 max available, only for uint8
110     uint8x16 ca = bit_xor(a, 0x80);
111     return i_reduce_max(ca) ^ 0x80;
112 #endif
113 }
114 
#if SIMDPP_USE_AVX2
// 32-lane overload: combine the two 16-lane halves with a lane-wise max and
// let the int8x16 overload finish the reduction.
static SIMDPP_INL
int8_t i_reduce_max(const int8<32>& a)
{
    const int8x16 lo = detail::extract128<0>(a);
    const int8x16 folded = max(lo, detail::extract128<1>(a));
    return i_reduce_max(folded);
}
#endif
124 
#if SIMDPP_USE_AVX512BW
// 64-lane overload: combine the two 32-lane halves with a lane-wise max and
// defer to the int8<32> overload.
SIMDPP_INL
int8_t i_reduce_max(const int8<64>& a)
{
    const int8<32> lo = detail::extract256<0>(a);
    const int8<32> folded = max(lo, detail::extract256<1>(a));
    return i_reduce_max(folded);
}
#endif
133 
134 template<unsigned N>
i_reduce_max(const int8<N> & a)135 SIMDPP_INL int8_t i_reduce_max(const int8<N>& a)
136 {
137 #if SIMDPP_USE_NULL
138     int8_t r = std::numeric_limits<int8_t>::min();;
139     for (unsigned j = 0; j < a.vec_length; ++j) {
140         for (unsigned i = 0; i < a.base_length; i++) {
141             r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
142         }
143     }
144     return r;
145 #elif SIMDPP_USE_SSE2 && !SIMDPP_USE_SSE4_1
146     // no instruction for int8 max available, only for uint8
147     uint8x16 r = bit_xor(a.vec(0), 0x80);
148     for (unsigned j = 1; j < a.vec_length; ++j) {
149         uint8x16 ca = bit_xor(a.vec(j), 0x80);
150         r = max(r, ca);
151     }
152     return i_reduce_max(r) ^ 0x80;
153 #else
154     int8v r = a.vec(0);
155     for (unsigned j = 1; j < a.vec_length; ++j) {
156         r = max(r, a.vec(j));
157     }
158     return i_reduce_max(r);
159 #endif
160 }
161 
162 // -----------------------------------------------------------------------------
163 static SIMDPP_INL
164 int16_t i_reduce_max(const int16x8& a);
165 
166 static SIMDPP_INL
i_reduce_max(const uint16x8 & a)167 uint16_t i_reduce_max(const uint16x8& a)
168 {
169 #if SIMDPP_USE_NULL
170     uint16_t r = a.el(0);
171     for (unsigned i = 0; i < a.length; i++) {
172         r = r > a.el(i) ? r : a.el(i);
173     }
174     return r;
175 #elif SIMDPP_USE_NEON64
176     return vmaxvq_u16(a.native());
177 #elif SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
178     uint16x8 r = max(a, move8_l<4>(a));
179     r = max(r, move8_l<2>(r));
180     r = max(r, move8_l<1>(r));
181     return extract<0>(r);
182 #elif SIMDPP_USE_SSE2
183     // no instruction for uint16 max available, only for int16
184     int16x8 ca = bit_xor(a, 0x8000);
185     return i_reduce_max(ca) ^ 0x8000;
186 #endif
187 }
188 
#if SIMDPP_USE_AVX2
// 16-lane overload: combine the two 8-lane halves with a lane-wise max and
// let the uint16x8 overload finish the reduction.
static SIMDPP_INL
uint16_t i_reduce_max(const uint16x16& a)
{
    const uint16x8 lo = detail::extract128<0>(a);
    const uint16x8 folded = max(lo, detail::extract128<1>(a));
    return i_reduce_max(folded);
}
#endif
198 
#if SIMDPP_USE_AVX512BW
// 32-lane overload: combine the two 16-lane halves with a lane-wise max and
// defer to the uint16<16> overload.
SIMDPP_INL
uint16_t i_reduce_max(const uint16<32>& a)
{
    const uint16<16> lo = detail::extract256<0>(a);
    const uint16<16> folded = max(lo, detail::extract256<1>(a));
    return i_reduce_max(folded);
}
#endif
207 
208 template<unsigned N>
i_reduce_max(const uint16<N> & a)209 SIMDPP_INL uint16_t i_reduce_max(const uint16<N>& a)
210 {
211 #if SIMDPP_USE_NULL
212     uint16_t r = std::numeric_limits<uint16_t>::min();;
213     for (unsigned j = 0; j < a.vec_length; ++j) {
214         for (unsigned i = 0; i < a.base_length; i++) {
215             r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
216         }
217     }
218     return r;
219 #elif SIMDPP_USE_SSE2 && !SIMDPP_USE_SSE4_1
220     // no instruction for uint16 max available, only for int16
221     int16x8 r = bit_xor(a.vec(0), 0x8000);
222     for (unsigned j = 1; j < a.vec_length; ++j) {
223         int16x8 ca = bit_xor(a.vec(j), 0x8000);
224         r = max(r, ca);
225     }
226     return i_reduce_max(r) ^ 0x8000;
227 #else
228     uint16v r = a.vec(0);
229     for (unsigned j = 1; j < a.vec_length; ++j) {
230         r = max(r, a.vec(j));
231     }
232     return i_reduce_max(r);
233 #endif
234 }
235 
236 // -----------------------------------------------------------------------------
237 
238 static SIMDPP_INL
i_reduce_max(const int16x8 & a)239 int16_t i_reduce_max(const int16x8& a)
240 {
241 #if SIMDPP_USE_NULL
242     int16_t r = a.el(0);
243     for (unsigned i = 0; i < a.length; i++) {
244         r = r > a.el(i) ? r : a.el(i);
245     }
246     return r;
247 #elif SIMDPP_USE_NEON64
248     return vmaxvq_s16(a.native());
249 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
250     int16x8 r = max(a, move8_l<4>(a));
251     r = max(r, move8_l<2>(r));
252     r = max(r, move8_l<1>(r));
253     return extract<0>(r);
254 #endif
255 }
256 
#if SIMDPP_USE_AVX2
// 16-lane overload: combine the two 8-lane halves with a lane-wise max and
// let the int16x8 overload finish the reduction.
static SIMDPP_INL
int16_t i_reduce_max(const int16x16& a)
{
    const int16x8 lo = detail::extract128<0>(a);
    const int16x8 folded = max(lo, detail::extract128<1>(a));
    return i_reduce_max(folded);
}
#endif
266 
#if SIMDPP_USE_AVX512BW
// 32-lane overload: combine the two 16-lane halves with a lane-wise max and
// defer to the int16<16> overload.
SIMDPP_INL
int16_t i_reduce_max(const int16<32>& a)
{
    const int16<16> lo = detail::extract256<0>(a);
    const int16<16> folded = max(lo, detail::extract256<1>(a));
    return i_reduce_max(folded);
}
#endif
275 
276 template<unsigned N>
i_reduce_max(const int16<N> & a)277 SIMDPP_INL int16_t i_reduce_max(const int16<N>& a)
278 {
279 #if SIMDPP_USE_NULL
280     int16_t r = std::numeric_limits<int16_t>::min();;
281     for (unsigned j = 0; j < a.vec_length; ++j) {
282         for (unsigned i = 0; i < a.base_length; i++) {
283             r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
284         }
285     }
286     return r;
287 #else
288     int16v r = a.vec(0);
289     for (unsigned j = 1; j < a.vec_length; ++j) {
290         r = max(r, a.vec(j));
291     }
292     return i_reduce_max(r);
293 #endif
294 }
295 
296 // -----------------------------------------------------------------------------
297 
298 static SIMDPP_INL
i_reduce_max(const uint32x4 & a)299 uint32_t i_reduce_max(const uint32x4& a)
300 {
301 #if SIMDPP_USE_NULL
302     uint32_t r = a.el(0);
303     for (unsigned i = 0; i < a.length; i++) {
304         r = r > a.el(i) ? r : a.el(i);
305     }
306     return r;
307 #elif SIMDPP_USE_NEON64
308     return vmaxvq_u32(a.native());
309 #elif SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
310     uint32x4 r = max(a, move4_l<2>(a));
311     r = max(r, move4_l<1>(r));
312     return extract<0>(r);
313 #elif SIMDPP_USE_SSE2
314     mem_block<uint32x4> b = a;
315     uint32_t r = b[0];
316     for (unsigned i = 1; i < b.length; i++) {
317         r = r > b[i] ? r : b[i];
318     }
319     return r;
320 #endif
321 }
322 
#if SIMDPP_USE_AVX2
// 8-lane overload: combine the two 4-lane halves with a lane-wise max, then
// finish with the move-and-max steps and read lane 0.
static SIMDPP_INL
uint32_t i_reduce_max(const uint32x8& a)
{
    const uint32x4 lo = detail::extract128<0>(a);
    uint32x4 acc = max(lo, detail::extract128<1>(a));
    acc = max(acc, move4_l<2>(acc));
    acc = max(acc, move4_l<1>(acc));
    return extract<0>(acc);
}
#endif
334 
#if SIMDPP_USE_AVX512F
// 16-lane overload: combine the two 8-lane halves with a lane-wise max and
// defer to the uint32<8> overload.
static SIMDPP_INL
uint32_t i_reduce_max(const uint32<16>& a)
{
    const uint32<8> folded =
        static_cast<uint32<8> >(max(extract256<0>(a), extract256<1>(a)));
    return i_reduce_max(folded);
}
#endif
342 
343 template<unsigned N>
i_reduce_max(const uint32<N> & a)344 SIMDPP_INL uint32_t i_reduce_max(const uint32<N>& a)
345 {
346 #if SIMDPP_USE_NULL
347     uint32_t r = std::numeric_limits<uint32_t>::min();;
348     for (unsigned j = 0; j < a.vec_length; ++j) {
349         for (unsigned i = 0; i < a.base_length; i++) {
350             r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
351         }
352     }
353     return r;
354 #else
355     uint32v r = a.vec(0);
356     for (unsigned j = 1; j < a.vec_length; ++j) {
357         r = max(r, a.vec(j));
358     }
359     return i_reduce_max(r);
360 #endif
361 }
362 
363 // -----------------------------------------------------------------------------
364 
365 static SIMDPP_INL
i_reduce_max(const int32x4 & a)366 int32_t i_reduce_max(const int32x4& a)
367 {
368 #if SIMDPP_USE_NULL
369     int32_t r = a.el(0);
370     for (unsigned i = 0; i < a.length; i++) {
371         r = r > a.el(i) ? r : a.el(i);
372     }
373     return r;
374 #elif SIMDPP_USE_NEON64
375     return vmaxvq_s32(a.native());
376 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
377     int32x4 r = max(a, move4_l<2>(a));
378     r = max(r, move4_l<1>(r));
379     return extract<0>(r);
380 #endif
381 }
382 
#if SIMDPP_USE_AVX2
// 8-lane overload: combine the two 4-lane halves with a lane-wise max, then
// finish with the move-and-max steps and read lane 0.
static SIMDPP_INL
int32_t i_reduce_max(const int32x8& a)
{
    const int32x4 lo = detail::extract128<0>(a);
    int32x4 acc = max(lo, detail::extract128<1>(a));
    acc = max(acc, move4_l<2>(acc));
    acc = max(acc, move4_l<1>(acc));
    return extract<0>(acc);
}
#endif
394 
#if SIMDPP_USE_AVX512F
// 16-lane overload: combine the two 8-lane halves with a lane-wise max and
// defer to the int32<8> overload.
static SIMDPP_INL
int32_t i_reduce_max(const int32<16>& a)
{
    const int32<8> folded =
        static_cast<int32<8> >(max(extract256<0>(a), extract256<1>(a)));
    return i_reduce_max(folded);
}
#endif
402 
403 template<unsigned N>
i_reduce_max(const int32<N> & a)404 SIMDPP_INL int32_t i_reduce_max(const int32<N>& a)
405 {
406 #if SIMDPP_USE_NULL
407     int32_t r = std::numeric_limits<int32_t>::min();;
408     for (unsigned j = 0; j < a.vec_length; ++j) {
409         for (unsigned i = 0; i < a.base_length; i++) {
410             r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
411         }
412     }
413     return r;
414 #else
415     int32v r = a.vec(0);
416     for (unsigned j = 1; j < a.vec_length; ++j) {
417         r = max(r, a.vec(j));
418     }
419     return i_reduce_max(r);
420 #endif
421 }
422 
423 // -----------------------------------------------------------------------------
424 
425 static SIMDPP_INL
i_reduce_max(const uint64x2 & a)426 uint64_t i_reduce_max(const uint64x2& a)
427 {
428 #if SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_207
429     uint64x2 r = max(a, move2_l<1>(a));
430     return extract<0>(r);
431 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON32
432     mem_block<uint64x2> b = a;
433     return b[0] > b[1] ? b[0] : b[1];
434 #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
435     uint64_t r = a.el(0);
436     for (unsigned i = 0; i < a.length; i++) {
437         r = r > a.el(i) ? r : a.el(i);
438     }
439     return r;
440 #else
441     return SIMDPP_NOT_IMPLEMENTED1(a);
442 #endif
443 }
444 
#if SIMDPP_USE_AVX2
// 4-lane overload: combine the two 2-lane halves with a lane-wise max, then
// max against a one-lane move and read lane 0.
static SIMDPP_INL
uint64_t i_reduce_max(const uint64x4& a)
{
    const uint64x2 lo = detail::extract128<0>(a);
    uint64x2 acc = max(lo, detail::extract128<1>(a));
    acc = max(acc, move2_l<1>(acc));
    return extract<0>(acc);
}
#endif
455 
#if SIMDPP_USE_AVX512F
// 8-lane overload: combine the two 4-lane halves with a lane-wise max and
// defer to the uint64<4> overload.
static SIMDPP_INL
uint64_t i_reduce_max(const uint64<8>& a)
{
    const uint64<4> folded =
        static_cast<uint64<4> >(max(extract256<0>(a), extract256<1>(a)));
    return i_reduce_max(folded);
}
#endif
463 
464 template<unsigned N>
i_reduce_max(const uint64<N> & a)465 SIMDPP_INL uint64_t i_reduce_max(const uint64<N>& a)
466 {
467 #if SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
468     uint64v r = a.vec(0);
469     for (unsigned j = 1; j < a.vec_length; ++j) {
470         r = max(r, a.vec(j));
471     }
472     return i_reduce_max(r);
473 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON32
474     uint64_t r = std::numeric_limits<uint64_t>::min();
475     for (unsigned j = 0; j < a.vec_length; ++j) {
476         mem_block<uint64v> b = a.vec(j);
477         for (unsigned i = 0; i < a.base_length; i++) {
478             r = r > b[i] ? r : b[i];
479         }
480     }
481     return r;
482 #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
483     uint64_t r = std::numeric_limits<uint64_t>::min();;
484     for (unsigned j = 0; j < a.vec_length; ++j) {
485         for (unsigned i = 0; i < a.base_length; i++) {
486             r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
487         }
488     }
489     return r;
490 #else
491     return SIMDPP_NOT_IMPLEMENTED1(a);
492 #endif
493 }
494 
495 // -----------------------------------------------------------------------------
496 
497 static SIMDPP_INL
i_reduce_max(const int64x2 & a)498 int64_t i_reduce_max(const int64x2& a)
499 {
500 #if SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
501     int64x2 r = max(a, move2_l<1>(a));
502     return extract<0>(r);
503 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON32
504     mem_block<int64x2> b = a;
505     return b[0] > b[1] ? b[0] : b[1];
506 #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
507     int64_t r = a.el(0);
508     for (unsigned i = 0; i < a.length; i++) {
509         r = r > a.el(i) ? r : a.el(i);
510     }
511     return r;
512 #else
513     return SIMDPP_NOT_IMPLEMENTED1(a);
514 #endif
515 }
516 
#if SIMDPP_USE_AVX2
// 4-lane overload: combine the two 2-lane halves with a lane-wise max, then
// max against a one-lane move and read lane 0.
static SIMDPP_INL
int64_t i_reduce_max(const int64x4& a)
{
    const int64x2 lo = detail::extract128<0>(a);
    int64x2 acc = max(lo, detail::extract128<1>(a));
    acc = max(acc, move2_l<1>(acc));
    return extract<0>(acc);
}
#endif
527 
#if SIMDPP_USE_AVX512F
// 8-lane overload: combine the two 4-lane halves with a lane-wise max and
// defer to the int64<4> overload.
static SIMDPP_INL
int64_t i_reduce_max(const int64<8>& a)
{
    const int64<4> folded =
        static_cast<int64<4> >(max(extract256<0>(a), extract256<1>(a)));
    return i_reduce_max(folded);
}
#endif
535 
536 template<unsigned N>
i_reduce_max(const int64<N> & a)537 SIMDPP_INL int64_t i_reduce_max(const int64<N>& a)
538 {
539 #if SIMDPP_USE_AXV2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
540     int64v r = a.vec(0);
541     for (unsigned j = 1; j < a.vec_length; ++j) {
542         r = max(r, a.vec(j));
543     }
544     return i_reduce_max(r);
545 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON32
546     int64_t r = std::numeric_limits<int64_t>::min();;
547     for (unsigned j = 0; j < a.vec_length; ++j) {
548         mem_block<int64v> b = a.vec(j);
549         for (unsigned i = 0; i < a.base_length; i++) {
550             r = r > b[i] ? r : b[i];
551         }
552     }
553     return r;
554 #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
555     int64_t r = std::numeric_limits<int64_t>::min();;
556     for (unsigned j = 0; j < a.vec_length; ++j) {
557         for (unsigned i = 0; i < a.base_length; i++) {
558             r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
559         }
560     }
561     return r;
562 #else
563     return SIMDPP_NOT_IMPLEMENTED1(a);
564 #endif
565 }
566 
567 // -----------------------------------------------------------------------------
568 
569 } // namespace insn
570 } // namespace detail
571 } // namespace SIMDPP_ARCH_NAMESPACE
572 } // namespace simdpp
573 
574 #endif
575 
576