/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_MATH_H_
#define VC_SSE_MATH_H_

#include "const.h"
#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
// copysign {{{1
Vc_INTRINSIC Vc_CONST SSE::float_v copysign(SSE::float_v mag, SSE::float_v sign)
{
    return _mm_or_ps(_mm_and_ps(sign.data(), SSE::_mm_setsignmask_ps()),
                     _mm_and_ps(mag.data(), SSE::_mm_setabsmask_ps()));
}
Vc_INTRINSIC Vc_CONST SSE::double_v copysign(SSE::double_v mag, SSE::double_v sign)
{
    return _mm_or_pd(_mm_and_pd(sign.data(), SSE::_mm_setsignmask_pd()),
                     _mm_and_pd(mag.data(), SSE::_mm_setabsmask_pd()));
}
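// Usage sketch for the two overloads above (the literal values are only
// illustrative): copysign combines the magnitude bits of `mag` with the sign
// bit of `sign`, mirroring std::copysign lane by lane.
//     SSE::float_v m = 3.f, s = -1.f;
//     SSE::float_v r = copysign(m, s);  // every lane of r is -3.f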

//}}}1

/**
 * Splits \p v into exponent and mantissa; the sign is kept with the mantissa.
 *
 * For finite, non-zero \p v the magnitude of the return value lies in the
 * range [0.5, 1.0) and \p e receives the integral power-of-two exponent, so
 * that v == ret * 2^e (see the usage sketch below the overloads).
 */
inline SSE::double_v frexp(const SSE::double_v &v,
                           SimdArray<int, 2, Scalar::int_v, 1> *e)
{
    const __m128i exponentBits = SSE::Const<double>::exponentMask().dataI();
    const __m128i exponentPart = _mm_and_si128(_mm_castpd_si128(v.data()), exponentBits);
    // shift the biased exponent into the low bits of each 64-bit lane and
    // subtract 0x3fe (bias - 1) so that the mantissa ends up in [0.5, 1.0)
    SSE::int_v exponent =
        _mm_sub_epi32(_mm_srli_epi64(exponentPart, 52), _mm_set1_epi32(0x3fe));
    const __m128d exponentMaximized = _mm_or_pd(v.data(), _mm_castsi128_pd(exponentBits));
    SSE::double_v ret = _mm_and_pd(
        exponentMaximized,
        _mm_load_pd(reinterpret_cast<const double *>(&SSE::c_general::frexpMask[0])));
    SSE::double_m zeroMask = v == SSE::double_v::Zero();
    ret(isnan(v) || !isfinite(v) || zeroMask) = v;
    exponent.setZero(zeroMask.data());
    // after the 64-bit shift each exponent sits in the low 32 bits of its
    // lane, i.e. in int lanes 0 and 2
    (*e)[0] = exponent[0];
    (*e)[1] = exponent[2];
    return ret;
}
inline SSE::float_v frexp(const SSE::float_v &v, SimdArray<int, 4, SSE::int_v, 4> *e)
{
    const __m128i exponentBits = SSE::Const<float>::exponentMask().dataI();
    const __m128i exponentPart = _mm_and_si128(_mm_castps_si128(v.data()), exponentBits);
    // 0x7e == float exponent bias - 1, scaling the mantissa into [0.5, 1.0)
    internal_data(*e) =
        _mm_sub_epi32(_mm_srli_epi32(exponentPart, 23), _mm_set1_epi32(0x7e));
    const __m128 exponentMaximized = _mm_or_ps(v.data(), _mm_castsi128_ps(exponentBits));
    SSE::float_v ret =
        _mm_and_ps(exponentMaximized, _mm_castsi128_ps(_mm_set1_epi32(0xbf7fffffu)));
    ret(isnan(v) || !isfinite(v) || v == SSE::float_v::Zero()) = v;
    e->setZero(v == SSE::float_v::Zero());
    return ret;
}
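// Hedged usage sketch for the frexp overloads above (values are illustrative
// only): for finite, non-zero input, v == ret * 2^e with 0.5 <= |ret| < 1 in
// every lane, matching std::frexp.
//     SSE::double_v v = 8.;                  // 8.0 == 0.5 * 2^4
//     SimdArray<int, 2, Scalar::int_v, 1> e;
//     SSE::double_v ret = frexp(v, &e);      // ret == 0.5, e == 4 per lane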

/* ldexp(x, e)  -> x * 2^e
 * x == NaN     -> NaN
 * x == (-)inf  -> (-)inf
 */
inline SSE::double_v ldexp(SSE::double_v::AsArg v,
                           const SimdArray<int, 2, Scalar::int_v, 1> &_e)
{
    SSE::int_v e = _mm_setr_epi32(_e[0], 0, _e[1], 0);
    e.setZero((v == SSE::double_v::Zero()).dataI());
    const __m128i exponentBits = _mm_slli_epi64(e.data(), 52);
    return _mm_castsi128_pd(_mm_add_epi64(_mm_castpd_si128(v.data()), exponentBits));
}
inline SSE::float_v ldexp(SSE::float_v::AsArg v,
                          const SimdArray<int, 4, SSE::int_v, 4> &_e)
{
    SSE::int_v e = internal_data(_e);
    e.setZero(simd_cast<SSE::int_m>(v == SSE::float_v::Zero()));
    return reinterpret_components_cast<SSE::float_v>(
        reinterpret_components_cast<SSE::int_v>(v) + (e << 23));
}
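// Hedged usage sketch (illustrative values): ldexp scales each lane by 2^e by
// adding e directly to the biased exponent field, so it assumes the result
// stays within the finite exponent range; for finite values it is the inverse
// of frexp.
//     SSE::float_v x = 0.75f;
//     SimdArray<int, 4, SSE::int_v, 4> e(3);
//     SSE::float_v y = ldexp(x, e);  // y == 0.75f * 2^3 == 6.f in every lane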

#ifdef Vc_IMPL_SSE4_1
// 0x3 == _MM_FROUND_TO_ZERO, i.e. SSE4.1 rounding towards zero
inline SSE::double_v trunc(SSE::double_v::AsArg v) { return _mm_round_pd(v.data(), 0x3); }
inline SSE::float_v trunc(SSE::float_v::AsArg v) { return _mm_round_ps(v.data(), 0x3); }

inline SSE::double_v floor(SSE::double_v::AsArg v) { return _mm_floor_pd(v.data()); }
inline SSE::float_v floor(SSE::float_v::AsArg v) { return _mm_floor_ps(v.data()); }

inline SSE::double_v ceil(SSE::double_v::AsArg v) { return _mm_ceil_pd(v.data()); }
inline SSE::float_v ceil(SSE::float_v::AsArg v) { return _mm_ceil_ps(v.data()); }
#else
inline SSE::Vector<float> trunc(SSE::Vector<float> x)
{
    const auto truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x.data()));
    const auto no_fractional_values = _mm_castsi128_ps(_mm_cmplt_epi32(
        _mm_and_si128(_mm_castps_si128(x.data()), _mm_set1_epi32(0x7f800000u)),
        _mm_set1_epi32(0x4b000000)));  // the exponent is so large that no mantissa bits
                                       // signify fractional values (0x3f8 + 23*8 = 0x4b0)
    return _mm_or_ps(_mm_andnot_ps(no_fractional_values, x.data()),
                     _mm_and_ps(no_fractional_values, truncated));
}
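// Clarifying note (plain IEEE-754 facts, nothing library-specific): 0x4b000000
// is the bit pattern of 2^23 == 8388608.f. The comparison therefore marks the
// lanes with |x| < 2^23, where the int32 round trip above is exact; lanes with
// |x| >= 2^23 (including inf and NaN) carry no fractional bits and are passed
// through unchanged. For example, trunc of 2.5f uses the converted value 2.f,
// while 8388608.f is returned as-is.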

inline SSE::Vector<double> trunc(SSE::Vector<double> x)
{
    const auto abs_x = Vc::abs(x).data();
    const auto min_no_fractional_bits =
        _mm_castsi128_pd(_mm_set1_epi64x(0x4330000000000000ull));  // 0x3ff + 52 = 0x433
    __m128d truncated =
        _mm_sub_pd(_mm_add_pd(abs_x, min_no_fractional_bits), min_no_fractional_bits);
    // due to rounding, the result can be too large. In this case `truncated >
    // abs(x)` holds, so subtract 1 from truncated if `abs(x) < truncated`
    truncated = _mm_sub_pd(truncated,
                           _mm_and_pd(_mm_cmplt_pd(abs_x, truncated), _mm_set1_pd(1.)));
    // finally, fix the sign bit:
    return _mm_or_pd(
        _mm_and_pd(_mm_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ull)), x.data()),
        truncated);
}
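// Worked example of the add/subtract trick above (plain IEEE-754 double
// arithmetic, nothing library-specific): for x == 2.7, abs(x) + 2^52 rounds to
// 2^52 + 3 under round-to-nearest, and subtracting 2^52 again leaves 3.0.
// Because 3.0 > 2.7, the correction step subtracts 1.0, yielding the truncated
// value 2.0; restoring the sign bit of x finishes the job.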

template <typename T> inline SSE::Vector<T> floor(SSE::Vector<T> x)
{
    auto y = trunc(x);
    y(!(y == x) && x < 0) -= 1;
    return y;
}

template <typename T> inline SSE::Vector<T> ceil(SSE::Vector<T> x)
{
    auto y = trunc(x);
    y(!(y == x || x < 0)) += 1;
    return y;
}
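// Behavioural sketch of the two fallbacks above (illustrative values): trunc
// rounds towards zero, so floor only steps down on negative, non-integral
// lanes and ceil only steps up on positive, non-integral lanes.
//     floor(SSE::float_v(-2.5f));  // -3.f in every lane
//     ceil(SSE::float_v(-2.5f));   // -2.f in every lane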
#endif
// fma {{{1
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> fma(Vector<T, VectorAbi::Sse> a,
                                               Vector<T, VectorAbi::Sse> b,
                                               Vector<T, VectorAbi::Sse> c)
{
    SSE::VectorHelper<T>::fma(a.data(), b.data(), c.data());
    return a;
}
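// Note on the definition above (an assumption about the helper's interface,
// not verified here): SSE::VectorHelper<T>::fma appears to take its first
// operand by reference and update it in place, which is why the by-value copy
// `a` is returned. Usage sketch:
//     SSE::float_v r = fma(a, b, c);  // r == a * b + c per lane, computed as
//                                     // a fused multiply-add where supported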
// }}}1
}  // namespace Vc

#endif // VC_SSE_MATH_H_

// vim: foldmethod=marker