1 /* Copyright (C) 2016 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6 */
7
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_MAX_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_MAX_H
10
11 #ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13 #endif
14
15 #include <simdpp/types.h>
16 #include <simdpp/core/i_max.h>
17 #include <simdpp/core/extract.h>
18 #include <simdpp/core/move_l.h>
19 #include <simdpp/core/make_uint.h>
20 #include <simdpp/detail/mem_block.h>
21 #include <simdpp/detail/extract128.h>
22 #include <limits>
23
24 namespace simdpp {
25 namespace SIMDPP_ARCH_NAMESPACE {
26 namespace detail {
27 namespace insn {
28
// Returns the maximum element of a 16x 8-bit unsigned vector.
static SIMDPP_INL
uint8_t i_reduce_max(const uint8x16& a)
{
#if SIMDPP_USE_NULL
    // Scalar fallback: linear scan over all elements.
    uint8_t r = a.el(0);
    for (unsigned i = 0; i < a.length; i++) {
        r = r > a.el(i) ? r : a.el(i);
    }
    return r;
#elif SIMDPP_USE_NEON64
    // AArch64 provides a single across-vector maximum instruction.
    return vmaxvq_u8(a.native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Log2 fold: shift the upper half down and take the lane-wise max,
    // repeatedly, until the maximum of all 16 lanes ends up in lane 0.
    uint8x16 r = max(a, move16_l<8>(a));
    r = max(r, move16_l<4>(r));
    r = max(r, move16_l<2>(r));
    r = max(r, move16_l<1>(r));
    return extract<0>(r);
#endif
}
48
49 #if SIMDPP_USE_AVX2
50 static SIMDPP_INL
i_reduce_max(const uint8<32> & a)51 uint8_t i_reduce_max(const uint8<32>& a)
52 {
53 uint8x16 r = detail::extract128<0>(a);
54 r = max(r, detail::extract128<1>(a));
55 return i_reduce_max(r);
56 }
57 #endif
58
59 #if SIMDPP_USE_AVX512BW
i_reduce_max(const uint8<64> & a)60 SIMDPP_INL uint8_t i_reduce_max(const uint8<64>& a)
61 {
62 uint8<32> r = detail::extract256<0>(a);
63 r = max(r, detail::extract256<1>(a));
64 return i_reduce_max(r);
65 }
66 #endif
67
68 template<unsigned N>
i_reduce_max(const uint8<N> & a)69 SIMDPP_INL uint8_t i_reduce_max(const uint8<N>& a)
70 {
71 #if SIMDPP_USE_NULL
72 uint8_t r = std::numeric_limits<uint8_t>::min();
73 for (unsigned j = 0; j < a.vec_length; ++j) {
74 for (unsigned i = 0; i < a.base_length; i++) {
75 r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
76 }
77 }
78 return r;
79 #else
80 uint8v r = a.vec(0);
81 for (unsigned j = 1; j < a.vec_length; ++j) {
82 r = max(r, a.vec(j));
83 }
84 return i_reduce_max(r);
85 #endif
86 }
87
88 // -----------------------------------------------------------------------------
89
// Returns the maximum element of a 16x 8-bit signed vector.
static SIMDPP_INL
int8_t i_reduce_max(const int8x16& a)
{
#if SIMDPP_USE_NULL
    // Scalar fallback: linear scan.
    int8_t r = a.el(0);
    for (unsigned i = 0; i < a.length; i++) {
        r = r > a.el(i) ? r : a.el(i);
    }
    return r;
#elif SIMDPP_USE_NEON64
    // AArch64 across-vector maximum.
    return vmaxvq_s8(a.native());
#elif SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Log2 fold down to lane 0.
    int8x16 r = a;
    r = max(r, move16_l<8>(r));
    r = max(r, move16_l<4>(r));
    r = max(r, move16_l<2>(r));
    r = max(r, move16_l<1>(r));
    return extract<0>(r);
#elif SIMDPP_USE_SSE2
    // no instruction for int8 max available, only for uint8
    // Flipping the sign bit maps signed order onto unsigned order, so we can
    // reuse the unsigned reduction and flip the bit back on the scalar result.
    uint8x16 ca = bit_xor(a, 0x80);
    return i_reduce_max(ca) ^ 0x80;
#endif
}
114
115 #if SIMDPP_USE_AVX2
116 static SIMDPP_INL
i_reduce_max(const int8<32> & a)117 int8_t i_reduce_max(const int8<32>& a)
118 {
119 int8x16 r = detail::extract128<0>(a);
120 r = max(r, detail::extract128<1>(a));
121 return i_reduce_max(r);
122 }
123 #endif
124
125 #if SIMDPP_USE_AVX512BW
i_reduce_max(const int8<64> & a)126 SIMDPP_INL int8_t i_reduce_max(const int8<64>& a)
127 {
128 int8<32> r = detail::extract256<0>(a);
129 r = max(r, detail::extract256<1>(a));
130 return i_reduce_max(r);
131 }
132 #endif
133
134 template<unsigned N>
i_reduce_max(const int8<N> & a)135 SIMDPP_INL int8_t i_reduce_max(const int8<N>& a)
136 {
137 #if SIMDPP_USE_NULL
138 int8_t r = std::numeric_limits<int8_t>::min();;
139 for (unsigned j = 0; j < a.vec_length; ++j) {
140 for (unsigned i = 0; i < a.base_length; i++) {
141 r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
142 }
143 }
144 return r;
145 #elif SIMDPP_USE_SSE2 && !SIMDPP_USE_SSE4_1
146 // no instruction for int8 max available, only for uint8
147 uint8x16 r = bit_xor(a.vec(0), 0x80);
148 for (unsigned j = 1; j < a.vec_length; ++j) {
149 uint8x16 ca = bit_xor(a.vec(j), 0x80);
150 r = max(r, ca);
151 }
152 return i_reduce_max(r) ^ 0x80;
153 #else
154 int8v r = a.vec(0);
155 for (unsigned j = 1; j < a.vec_length; ++j) {
156 r = max(r, a.vec(j));
157 }
158 return i_reduce_max(r);
159 #endif
160 }
161
162 // -----------------------------------------------------------------------------
// Forward declaration: the SSE2 path of the uint16x8 overload below reduces
// through this int16 overload, which is defined further down in this file.
static SIMDPP_INL
int16_t i_reduce_max(const int16x8& a);
165
// Returns the maximum element of an 8x 16-bit unsigned vector.
static SIMDPP_INL
uint16_t i_reduce_max(const uint16x8& a)
{
#if SIMDPP_USE_NULL
    // Scalar fallback: linear scan.
    uint16_t r = a.el(0);
    for (unsigned i = 0; i < a.length; i++) {
        r = r > a.el(i) ? r : a.el(i);
    }
    return r;
#elif SIMDPP_USE_NEON64
    // AArch64 across-vector maximum.
    return vmaxvq_u16(a.native());
#elif SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Log2 fold down to lane 0.
    uint16x8 r = max(a, move8_l<4>(a));
    r = max(r, move8_l<2>(r));
    r = max(r, move8_l<1>(r));
    return extract<0>(r);
#elif SIMDPP_USE_SSE2
    // no instruction for uint16 max available, only for int16
    // Sign-bit flip maps unsigned order onto signed order; the int16 overload
    // used here is forward-declared above. Flip the bit back on the result.
    int16x8 ca = bit_xor(a, 0x8000);
    return i_reduce_max(ca) ^ 0x8000;
#endif
}
188
189 #if SIMDPP_USE_AVX2
190 static SIMDPP_INL
i_reduce_max(const uint16x16 & a)191 uint16_t i_reduce_max(const uint16x16& a)
192 {
193 uint16x8 r = detail::extract128<0>(a);
194 r = max(r, detail::extract128<1>(a));
195 return i_reduce_max(r);
196 }
197 #endif
198
199 #if SIMDPP_USE_AVX512BW
i_reduce_max(const uint16<32> & a)200 SIMDPP_INL uint16_t i_reduce_max(const uint16<32>& a)
201 {
202 uint16<16> r = detail::extract256<0>(a);
203 r = max(r, detail::extract256<1>(a));
204 return i_reduce_max(r);
205 }
206 #endif
207
208 template<unsigned N>
i_reduce_max(const uint16<N> & a)209 SIMDPP_INL uint16_t i_reduce_max(const uint16<N>& a)
210 {
211 #if SIMDPP_USE_NULL
212 uint16_t r = std::numeric_limits<uint16_t>::min();;
213 for (unsigned j = 0; j < a.vec_length; ++j) {
214 for (unsigned i = 0; i < a.base_length; i++) {
215 r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
216 }
217 }
218 return r;
219 #elif SIMDPP_USE_SSE2 && !SIMDPP_USE_SSE4_1
220 // no instruction for uint16 max available, only for int16
221 int16x8 r = bit_xor(a.vec(0), 0x8000);
222 for (unsigned j = 1; j < a.vec_length; ++j) {
223 int16x8 ca = bit_xor(a.vec(j), 0x8000);
224 r = max(r, ca);
225 }
226 return i_reduce_max(r) ^ 0x8000;
227 #else
228 uint16v r = a.vec(0);
229 for (unsigned j = 1; j < a.vec_length; ++j) {
230 r = max(r, a.vec(j));
231 }
232 return i_reduce_max(r);
233 #endif
234 }
235
236 // -----------------------------------------------------------------------------
237
// Returns the maximum element of an 8x 16-bit signed vector.
static SIMDPP_INL
int16_t i_reduce_max(const int16x8& a)
{
#if SIMDPP_USE_NULL
    // Scalar fallback: linear scan.
    int16_t r = a.el(0);
    for (unsigned i = 0; i < a.length; i++) {
        r = r > a.el(i) ? r : a.el(i);
    }
    return r;
#elif SIMDPP_USE_NEON64
    // AArch64 across-vector maximum.
    return vmaxvq_s16(a.native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Log2 fold down to lane 0 (signed 16-bit max exists since SSE2).
    int16x8 r = max(a, move8_l<4>(a));
    r = max(r, move8_l<2>(r));
    r = max(r, move8_l<1>(r));
    return extract<0>(r);
#endif
}
256
257 #if SIMDPP_USE_AVX2
258 static SIMDPP_INL
i_reduce_max(const int16x16 & a)259 int16_t i_reduce_max(const int16x16& a)
260 {
261 int16x8 r = detail::extract128<0>(a);
262 r = max(r, detail::extract128<1>(a));
263 return i_reduce_max(r);
264 }
265 #endif
266
267 #if SIMDPP_USE_AVX512BW
i_reduce_max(const int16<32> & a)268 SIMDPP_INL int16_t i_reduce_max(const int16<32>& a)
269 {
270 int16<16> r = detail::extract256<0>(a);
271 r = max(r, detail::extract256<1>(a));
272 return i_reduce_max(r);
273 }
274 #endif
275
276 template<unsigned N>
i_reduce_max(const int16<N> & a)277 SIMDPP_INL int16_t i_reduce_max(const int16<N>& a)
278 {
279 #if SIMDPP_USE_NULL
280 int16_t r = std::numeric_limits<int16_t>::min();;
281 for (unsigned j = 0; j < a.vec_length; ++j) {
282 for (unsigned i = 0; i < a.base_length; i++) {
283 r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
284 }
285 }
286 return r;
287 #else
288 int16v r = a.vec(0);
289 for (unsigned j = 1; j < a.vec_length; ++j) {
290 r = max(r, a.vec(j));
291 }
292 return i_reduce_max(r);
293 #endif
294 }
295
296 // -----------------------------------------------------------------------------
297
// Returns the maximum element of a 4x 32-bit unsigned vector.
static SIMDPP_INL
uint32_t i_reduce_max(const uint32x4& a)
{
#if SIMDPP_USE_NULL
    // Scalar fallback: linear scan.
    uint32_t r = a.el(0);
    for (unsigned i = 0; i < a.length; i++) {
        r = r > a.el(i) ? r : a.el(i);
    }
    return r;
#elif SIMDPP_USE_NEON64
    // AArch64 across-vector maximum.
    return vmaxvq_u32(a.native());
#elif SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Log2 fold down to lane 0.
    uint32x4 r = max(a, move4_l<2>(a));
    r = max(r, move4_l<1>(r));
    return extract<0>(r);
#elif SIMDPP_USE_SSE2
    // No unsigned 32-bit vector max before SSE4.1: go through memory and
    // reduce with scalar code.
    mem_block<uint32x4> b = a;
    uint32_t r = b[0];
    for (unsigned i = 1; i < b.length; i++) {
        r = r > b[i] ? r : b[i];
    }
    return r;
#endif
}
322
323 #if SIMDPP_USE_AVX2
324 static SIMDPP_INL
i_reduce_max(const uint32x8 & a)325 uint32_t i_reduce_max(const uint32x8& a)
326 {
327 uint32x4 r = detail::extract128<0>(a);
328 r = max(r, detail::extract128<1>(a));
329 r = max(r, move4_l<2>(r));
330 r = max(r, move4_l<1>(r));
331 return extract<0>(r);
332 }
333 #endif
334
335 #if SIMDPP_USE_AVX512F
336 static SIMDPP_INL
i_reduce_max(const uint32<16> & a)337 uint32_t i_reduce_max(const uint32<16>& a)
338 {
339 return i_reduce_max((uint32<8>)max(extract256<0>(a), extract256<1>(a)));
340 }
341 #endif
342
343 template<unsigned N>
i_reduce_max(const uint32<N> & a)344 SIMDPP_INL uint32_t i_reduce_max(const uint32<N>& a)
345 {
346 #if SIMDPP_USE_NULL
347 uint32_t r = std::numeric_limits<uint32_t>::min();;
348 for (unsigned j = 0; j < a.vec_length; ++j) {
349 for (unsigned i = 0; i < a.base_length; i++) {
350 r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
351 }
352 }
353 return r;
354 #else
355 uint32v r = a.vec(0);
356 for (unsigned j = 1; j < a.vec_length; ++j) {
357 r = max(r, a.vec(j));
358 }
359 return i_reduce_max(r);
360 #endif
361 }
362
363 // -----------------------------------------------------------------------------
364
// Returns the maximum element of a 4x 32-bit signed vector.
static SIMDPP_INL
int32_t i_reduce_max(const int32x4& a)
{
#if SIMDPP_USE_NULL
    // Scalar fallback: linear scan.
    int32_t r = a.el(0);
    for (unsigned i = 0; i < a.length; i++) {
        r = r > a.el(i) ? r : a.el(i);
    }
    return r;
#elif SIMDPP_USE_NEON64
    // AArch64 across-vector maximum.
    return vmaxvq_s32(a.native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    // Log2 fold down to lane 0.
    int32x4 r = max(a, move4_l<2>(a));
    r = max(r, move4_l<1>(r));
    return extract<0>(r);
#endif
}
382
383 #if SIMDPP_USE_AVX2
384 static SIMDPP_INL
i_reduce_max(const int32x8 & a)385 int32_t i_reduce_max(const int32x8& a)
386 {
387 int32x4 r = detail::extract128<0>(a);
388 r = max(r, detail::extract128<1>(a));
389 r = max(r, move4_l<2>(r));
390 r = max(r, move4_l<1>(r));
391 return extract<0>(r);
392 }
393 #endif
394
395 #if SIMDPP_USE_AVX512F
396 static SIMDPP_INL
i_reduce_max(const int32<16> & a)397 int32_t i_reduce_max(const int32<16>& a)
398 {
399 return i_reduce_max((int32<8>)max(extract256<0>(a), extract256<1>(a)));
400 }
401 #endif
402
403 template<unsigned N>
i_reduce_max(const int32<N> & a)404 SIMDPP_INL int32_t i_reduce_max(const int32<N>& a)
405 {
406 #if SIMDPP_USE_NULL
407 int32_t r = std::numeric_limits<int32_t>::min();;
408 for (unsigned j = 0; j < a.vec_length; ++j) {
409 for (unsigned i = 0; i < a.base_length; i++) {
410 r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
411 }
412 }
413 return r;
414 #else
415 int32v r = a.vec(0);
416 for (unsigned j = 1; j < a.vec_length; ++j) {
417 r = max(r, a.vec(j));
418 }
419 return i_reduce_max(r);
420 #endif
421 }
422
423 // -----------------------------------------------------------------------------
424
425 static SIMDPP_INL
i_reduce_max(const uint64x2 & a)426 uint64_t i_reduce_max(const uint64x2& a)
427 {
428 #if SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_207
429 uint64x2 r = max(a, move2_l<1>(a));
430 return extract<0>(r);
431 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON32
432 mem_block<uint64x2> b = a;
433 return b[0] > b[1] ? b[0] : b[1];
434 #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
435 uint64_t r = a.el(0);
436 for (unsigned i = 0; i < a.length; i++) {
437 r = r > a.el(i) ? r : a.el(i);
438 }
439 return r;
440 #else
441 return SIMDPP_NOT_IMPLEMENTED1(a);
442 #endif
443 }
444
445 #if SIMDPP_USE_AVX2
446 static SIMDPP_INL
i_reduce_max(const uint64x4 & a)447 uint64_t i_reduce_max(const uint64x4& a)
448 {
449 uint64x2 r = detail::extract128<0>(a);
450 r = max(r, detail::extract128<1>(a));
451 r = max(r, move2_l<1>(r));
452 return extract<0>(r);
453 }
454 #endif
455
456 #if SIMDPP_USE_AVX512F
457 static SIMDPP_INL
i_reduce_max(const uint64<8> & a)458 uint64_t i_reduce_max(const uint64<8>& a)
459 {
460 return i_reduce_max((uint64<4>)max(extract256<0>(a), extract256<1>(a)));
461 }
462 #endif
463
464 template<unsigned N>
i_reduce_max(const uint64<N> & a)465 SIMDPP_INL uint64_t i_reduce_max(const uint64<N>& a)
466 {
467 #if SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
468 uint64v r = a.vec(0);
469 for (unsigned j = 1; j < a.vec_length; ++j) {
470 r = max(r, a.vec(j));
471 }
472 return i_reduce_max(r);
473 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON32
474 uint64_t r = std::numeric_limits<uint64_t>::min();
475 for (unsigned j = 0; j < a.vec_length; ++j) {
476 mem_block<uint64v> b = a.vec(j);
477 for (unsigned i = 0; i < a.base_length; i++) {
478 r = r > b[i] ? r : b[i];
479 }
480 }
481 return r;
482 #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
483 uint64_t r = std::numeric_limits<uint64_t>::min();;
484 for (unsigned j = 0; j < a.vec_length; ++j) {
485 for (unsigned i = 0; i < a.base_length; i++) {
486 r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
487 }
488 }
489 return r;
490 #else
491 return SIMDPP_NOT_IMPLEMENTED1(a);
492 #endif
493 }
494
495 // -----------------------------------------------------------------------------
496
// Returns the maximum element of a 2x 64-bit signed vector.
static SIMDPP_INL
int64_t i_reduce_max(const int64x2& a)
{
#if SIMDPP_USE_AVX2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
    // 64-bit vector max is available: fold the upper element onto the lower one.
    int64x2 r = max(a, move2_l<1>(a));
    return extract<0>(r);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON32
    // No 64-bit vector max: go through memory and compare with scalar code.
    mem_block<int64x2> b = a;
    return b[0] > b[1] ? b[0] : b[1];
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    // Scalar fallback: linear scan.
    int64_t r = a.el(0);
    for (unsigned i = 0; i < a.length; i++) {
        r = r > a.el(i) ? r : a.el(i);
    }
    return r;
#else
    return SIMDPP_NOT_IMPLEMENTED1(a);
#endif
}
516
517 #if SIMDPP_USE_AVX2
518 static SIMDPP_INL
i_reduce_max(const int64x4 & a)519 int64_t i_reduce_max(const int64x4& a)
520 {
521 int64x2 r = detail::extract128<0>(a);
522 r = max(r, detail::extract128<1>(a));
523 r = max(r, move2_l<1>(r));
524 return extract<0>(r);
525 }
526 #endif
527
528 #if SIMDPP_USE_AVX512F
529 static SIMDPP_INL
i_reduce_max(const int64<8> & a)530 int64_t i_reduce_max(const int64<8>& a)
531 {
532 return i_reduce_max((int64<4>)max(extract256<0>(a), extract256<1>(a)));
533 }
534 #endif
535
536 template<unsigned N>
i_reduce_max(const int64<N> & a)537 SIMDPP_INL int64_t i_reduce_max(const int64<N>& a)
538 {
539 #if SIMDPP_USE_AXV2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
540 int64v r = a.vec(0);
541 for (unsigned j = 1; j < a.vec_length; ++j) {
542 r = max(r, a.vec(j));
543 }
544 return i_reduce_max(r);
545 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON32
546 int64_t r = std::numeric_limits<int64_t>::min();;
547 for (unsigned j = 0; j < a.vec_length; ++j) {
548 mem_block<int64v> b = a.vec(j);
549 for (unsigned i = 0; i < a.base_length; i++) {
550 r = r > b[i] ? r : b[i];
551 }
552 }
553 return r;
554 #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
555 int64_t r = std::numeric_limits<int64_t>::min();;
556 for (unsigned j = 0; j < a.vec_length; ++j) {
557 for (unsigned i = 0; i < a.base_length; i++) {
558 r = r > a.vec(j).el(i) ? r : a.vec(j).el(i);
559 }
560 }
561 return r;
562 #else
563 return SIMDPP_NOT_IMPLEMENTED1(a);
564 #endif
565 }
566
567 // -----------------------------------------------------------------------------
568
569 } // namespace insn
570 } // namespace detail
571 } // namespace SIMDPP_ARCH_NAMESPACE
572 } // namespace simdpp
573
574 #endif
575
576