1 /*  Copyright (C) 2016  Povilas Kanapickas <povilas@radix.lt>
2 
3     Distributed under the Boost Software License, Version 1.0.
4         (See accompanying file LICENSE_1_0.txt or copy at
5             http://www.boost.org/LICENSE_1_0.txt)
6 */
7 
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_F_REDUCE_MAX_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_F_REDUCE_MAX_H
10 
11 #ifndef LIBSIMDPP_SIMD_H
12     #error "This file must be included through simd.h"
13 #endif
14 
15 #include <simdpp/types.h>
16 #include <simdpp/core/extract.h>
17 #include <simdpp/core/f_max.h>
18 #include <simdpp/core/permute2.h>
19 #include <simdpp/detail/extract128.h>
20 #include <simdpp/detail/workarounds.h>
21 
22 namespace simdpp {
23 namespace SIMDPP_ARCH_NAMESPACE {
24 namespace detail {
25 namespace insn {
26 
27 
28 static SIMDPP_INL
i_reduce_max(const float32x4 & a)29 float i_reduce_max(const float32x4& a)
30 {
31 #if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
32     float r = a.el(0);
33     for (unsigned i = 1; i < a.length; i++) {
34         r = r > a.el(i) ? r : a.el(i); // TODO nan
35     }
36     return r;
37 #elif SIMDPP_USE_SSE2
38     float32x4 b = _mm_movehl_ps(a.native(), a.native());
39     b = max(a, b);
40     b = max(b, permute2<1,1>(b));
41     return _mm_cvtss_f32(b.native());
42 #elif SIMDPP_USE_NEON64
43     return vmaxnmvq_f32(a.native());
44 #elif SIMDPP_USE_NEON_FLT_SP
45     float32x2_t a2 = vpmax_f32(vget_low_f32(a.native()), vget_high_f32(a.native()));
46     a2 = vpmax_f32(a2, a2);
47     return vget_lane_f32(a2, 0);
48 #elif SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
49     float32x4 b = a;
50     b = max(b, move4_l<1>(b));
51     b = max(b, move4_l<2>(b));
52     return extract<0>(b);
53 #endif
54 }
55 
#if SIMDPP_USE_AVX
/** Reduces a float32x8 vector to the maximum of its lanes.

    The two float32x4 halves are combined with an element-wise max and the
    result is handed to the float32x4 overload.

    @param a vector whose lanes are reduced
    @return the maximum lane value of @a a
*/
static SIMDPP_INL float i_reduce_max(const float32x8& a)
{
    const float32x4 lo = detail::extract128<0>(a);
    const float32x4 hi = detail::extract128<1>(a);
    const float32x4 merged = max(lo, hi);
    return i_reduce_max(merged);
}
#endif
66 
#if SIMDPP_USE_AVX512F
/** Reduces a 16-lane float32 vector to the maximum of its lanes.

    The two halves obtained via extract256 are combined with an element-wise
    max; the narrower result is reduced by another i_reduce_max overload.

    @param a vector whose lanes are reduced
    @return the maximum lane value of @a a
*/
static SIMDPP_INL float i_reduce_max(const float32<16>& a)
{
    const float32<8> lo = extract256<0>(a);
    const float32<8> hi = extract256<1>(a);
    return i_reduce_max(max(lo, hi));
}
#endif
74 
75 template<unsigned N>
i_reduce_max(const float32<N> & a)76 SIMDPP_INL float i_reduce_max(const float32<N>& a)
77 {
78     float32v r = a.vec(0);
79     for (unsigned i = 1; i < a.vec_length; ++i)
80         r = max(r, a.vec(i));
81     return i_reduce_max(r);
82 }
83 
84 // -----------------------------------------------------------------------------
85 
86 static SIMDPP_INL
i_reduce_max(const float64x2 & a)87 double i_reduce_max(const float64x2& a)
88 {
89 #if SIMDPP_USE_SSE2
90     float64x2 b = max(a, permute2<1,1>(a));
91     return _mm_cvtsd_f64(b.native());
92 #elif SIMDPP_USE_NEON64
93     return vmaxnmvq_f64(a.native());
94 #elif SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
95     float64x2 b = max(a, permute2<1,1>(a));
96     return extract<0>(b);
97 #elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
98     double r = a.el(0);
99     for (unsigned i = 1; i < a.length; i++) {
100         r = r > a.el(i) ? r : a.el(i); // TODO nan
101     }
102     return r;
103 #endif
104 }
105 
#if SIMDPP_USE_AVX
/** Reduces a float64x4 vector to the maximum of its lanes.

    The two float64x2 halves are combined with an element-wise max and the
    result is handed to the float64x2 overload.

    @param a vector whose lanes are reduced
    @return the maximum lane value of @a a
*/
static SIMDPP_INL double i_reduce_max(const float64x4& a)
{
    const float64x2 lo = detail::extract128<0>(a);
    const float64x2 hi = detail::extract128<1>(a);
    const float64x2 merged = max(lo, hi);
    return i_reduce_max(merged);
}
#endif
116 
#if SIMDPP_USE_AVX512F
/** Reduces an 8-lane float64 vector to the maximum of its lanes.

    The two halves obtained via extract256 are combined with an element-wise
    max; the narrower result is reduced by another i_reduce_max overload.

    @param a vector whose lanes are reduced
    @return the maximum lane value of @a a
*/
static SIMDPP_INL double i_reduce_max(const float64<8>& a)
{
    const float64<4> lo = extract256<0>(a);
    const float64<4> hi = extract256<1>(a);
    return i_reduce_max(max(lo, hi));
}
#endif
124 
125 template<unsigned N>
i_reduce_max(const float64<N> & a)126 SIMDPP_INL double i_reduce_max(const float64<N>& a)
127 {
128     float64v r = a.vec(0);
129     for (unsigned i = 1; i < a.vec_length; ++i)
130         r = max(r, a.vec(i));
131     return i_reduce_max(r);
132 }
133 
134 } // namespace insn
135 } // namespace detail
136 } // namespace SIMDPP_ARCH_NAMESPACE
137 } // namespace simdpp
138 
139 #endif
140 
141