1 /* Copyright (C) 2016 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6 */
7
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_F_REDUCE_MAX_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_F_REDUCE_MAX_H
10
11 #ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13 #endif
14
15 #include <simdpp/types.h>
16 #include <simdpp/core/extract.h>
17 #include <simdpp/core/f_max.h>
18 #include <simdpp/core/permute2.h>
19 #include <simdpp/detail/extract128.h>
20 #include <simdpp/detail/workarounds.h>
21
22 namespace simdpp {
23 namespace SIMDPP_ARCH_NAMESPACE {
24 namespace detail {
25 namespace insn {
26
27
28 static SIMDPP_INL
i_reduce_max(const float32x4 & a)29 float i_reduce_max(const float32x4& a)
30 {
31 #if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
32 float r = a.el(0);
33 for (unsigned i = 1; i < a.length; i++) {
34 r = r > a.el(i) ? r : a.el(i); // TODO nan
35 }
36 return r;
37 #elif SIMDPP_USE_SSE2
38 float32x4 b = _mm_movehl_ps(a.native(), a.native());
39 b = max(a, b);
40 b = max(b, permute2<1,1>(b));
41 return _mm_cvtss_f32(b.native());
42 #elif SIMDPP_USE_NEON64
43 return vmaxnmvq_f32(a.native());
44 #elif SIMDPP_USE_NEON_FLT_SP
45 float32x2_t a2 = vpmax_f32(vget_low_f32(a.native()), vget_high_f32(a.native()));
46 a2 = vpmax_f32(a2, a2);
47 return vget_lane_f32(a2, 0);
48 #elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
49 float32x4 b = a;
50 b = max(b, move4_l<1>(b));
51 b = max(b, move4_l<2>(b));
52 return extract<0>(b);
53 #endif
54 }
55
#if SIMDPP_USE_AVX
/// Returns the largest of the eight single-precision elements of @a a by
/// combining its two 128-bit halves and delegating to the float32x4 overload.
static SIMDPP_INL
float i_reduce_max(const float32x8& a)
{
    // Element-wise max of the low half (index 0) against the high half (index 1).
    const float32x4 halves_max = max(float32x4(detail::extract128<0>(a)),
                                     float32x4(detail::extract128<1>(a)));
    return i_reduce_max(halves_max);
}
#endif
66
#if SIMDPP_USE_AVX512F
/// Returns the largest of the sixteen single-precision elements of @a a by
/// combining its two 256-bit halves and reducing the result further.
static SIMDPP_INL
float i_reduce_max(const float32<16>& a)
{
    const float32<8> low_half = extract256<0>(a);
    const float32<8> high_half = extract256<1>(a);
    return i_reduce_max(max(low_half, high_half));
}
#endif
74
75 template<unsigned N>
i_reduce_max(const float32<N> & a)76 SIMDPP_INL float i_reduce_max(const float32<N>& a)
77 {
78 float32v r = a.vec(0);
79 for (unsigned i = 1; i < a.vec_length; ++i)
80 r = max(r, a.vec(i));
81 return i_reduce_max(r);
82 }
83
84 // -----------------------------------------------------------------------------
85
86 static SIMDPP_INL
i_reduce_max(const float64x2 & a)87 double i_reduce_max(const float64x2& a)
88 {
89 #if SIMDPP_USE_SSE2
90 float64x2 b = max(a, permute2<1,1>(a));
91 return _mm_cvtsd_f64(b.native());
92 #elif SIMDPP_USE_NEON64
93 return vmaxnmvq_f64(a.native());
94 #elif SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
95 float64x2 b = max(a, permute2<1,1>(a));
96 return extract<0>(b);
97 #elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
98 double r = a.el(0);
99 for (unsigned i = 1; i < a.length; i++) {
100 r = r > a.el(i) ? r : a.el(i); // TODO nan
101 }
102 return r;
103 #endif
104 }
105
#if SIMDPP_USE_AVX
/// Returns the largest of the four double-precision elements of @a a by
/// combining its two 128-bit halves and delegating to the float64x2 overload.
static SIMDPP_INL
double i_reduce_max(const float64x4& a)
{
    // Element-wise max of the low half (index 0) against the high half (index 1).
    const float64x2 halves_max = max(float64x2(detail::extract128<0>(a)),
                                     float64x2(detail::extract128<1>(a)));
    return i_reduce_max(halves_max);
}
#endif
116
#if SIMDPP_USE_AVX512F
/// Returns the largest of the eight double-precision elements of @a a by
/// combining its two 256-bit halves and reducing the result further.
static SIMDPP_INL
double i_reduce_max(const float64<8>& a)
{
    const float64<4> low_half = extract256<0>(a);
    const float64<4> high_half = extract256<1>(a);
    return i_reduce_max(max(low_half, high_half));
}
#endif
124
125 template<unsigned N>
i_reduce_max(const float64<N> & a)126 SIMDPP_INL double i_reduce_max(const float64<N>& a)
127 {
128 float64v r = a.vec(0);
129 for (unsigned i = 1; i < a.vec_length; ++i)
130 r = max(r, a.vec(i));
131 return i_reduce_max(r);
132 }
133
134 } // namespace insn
135 } // namespace detail
136 } // namespace SIMDPP_ARCH_NAMESPACE
137 } // namespace simdpp
138
139 #endif
140
141