1 /* This file is part of the Vc library. {{{
2 Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
6 * Redistributions of source code must retain the above copyright
7 notice, this list of conditions and the following disclaimer.
8 * Redistributions in binary form must reproduce the above copyright
9 notice, this list of conditions and the following disclaimer in the
10 documentation and/or other materials provided with the distribution.
11 * Neither the names of contributing organizations nor the
12 names of its contributors may be used to endorse or promote products
13 derived from this software without specific prior written permission.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
19 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26 }}}*/
27
28 #ifndef VC_SSE_MATH_H_
29 #define VC_SSE_MATH_H_
30
31 #include "const.h"
32 #include "macros.h"
33
34 namespace Vc_VERSIONED_NAMESPACE
35 {
36 // copysign {{{1
copysign(SSE::float_v mag,SSE::float_v sign)37 Vc_INTRINSIC Vc_CONST SSE::float_v copysign(SSE::float_v mag, SSE::float_v sign)
38 {
39 return _mm_or_ps(_mm_and_ps(sign.data(), SSE::_mm_setsignmask_ps()),
40 _mm_and_ps(mag.data(), SSE::_mm_setabsmask_ps()));
41 }
copysign(SSE::double_v mag,SSE::double_v sign)42 Vc_INTRINSIC Vc_CONST SSE::double_v copysign(SSE::double_v mag, SSE::double_v sign)
43 {
44 return _mm_or_pd(_mm_and_pd(sign.data(), SSE::_mm_setsignmask_pd()),
45 _mm_and_pd(mag.data(), SSE::_mm_setabsmask_pd()));
46 }
47
48 //}}}1
49
/**
 * Splits \p v into exponent and mantissa; the sign is kept with the mantissa.
 *
 * The return value will be in the range [0.5, 1.0[ (magnitude; sign preserved).
 * The \p e value will be an integer defining the power-of-two exponent.
 */
inline SSE::double_v frexp(const SSE::double_v &v,
                           SimdArray<int, 2, Scalar::int_v, 1> *e)
{
    // Isolate the 11 biased-exponent bits of each double.
    const __m128i exponentBits = SSE::Const<double>::exponentMask().dataI();
    const __m128i exponentPart = _mm_and_si128(_mm_castpd_si128(v.data()), exponentBits);
    // Shift the exponent field down to bit 0 and un-bias it. frexp uses
    // 0x3fe (bias - 1) instead of the IEEE bias 0x3ff because the mantissa is
    // normalized to [0.5, 1.0[ rather than [1.0, 2.0[.
    SSE::int_v exponent =
        _mm_sub_epi32(_mm_srli_epi64(exponentPart, 52), _mm_set1_epi32(0x3fe));
    // Set all exponent bits to 1, then AND with frexpMask to force the
    // exponent field to the fixed pattern that encodes [0.5, 1.0[, keeping
    // the original sign and mantissa bits.
    const __m128d exponentMaximized = _mm_or_pd(v.data(), _mm_castsi128_pd(exponentBits));
    SSE::double_v ret = _mm_and_pd(
        exponentMaximized,
        _mm_load_pd(reinterpret_cast<const double *>(&SSE::c_general::frexpMask[0])));
    // Special cases pass the input through unchanged: NaN, +/-inf, and +/-0.
    SSE::double_m zeroMask = v == SSE::double_v::Zero();
    ret(isnan(v) || !isfinite(v) || zeroMask) = v;
    // frexp(0) must report exponent 0.
    exponent.setZero(zeroMask.data());
    // The exponents were computed in the low 32 bits of each 64-bit lane
    // (int_v indices 0 and 2); scatter them into the two-entry SimdArray.
    (*e)[0] = exponent[0];
    (*e)[1] = exponent[2];
    return ret;
}
/**
 * Splits \p v into exponent and mantissa; the sign is kept with the mantissa.
 * float overload — see the double_v overload above for the full contract.
 */
inline SSE::float_v frexp(const SSE::float_v &v, SimdArray<int, 4, SSE::int_v, 4> *e)
{
    // Isolate the 8 biased-exponent bits of each float.
    const __m128i exponentBits = SSE::Const<float>::exponentMask().dataI();
    const __m128i exponentPart = _mm_and_si128(_mm_castps_si128(v.data()), exponentBits);
    // Shift the exponent field down and un-bias with 0x7e (bias - 1) so the
    // mantissa is normalized to [0.5, 1.0[ instead of [1.0, 2.0[.
    internal_data(*e) =
        _mm_sub_epi32(_mm_srli_epi32(exponentPart, 23), _mm_set1_epi32(0x7e));
    // Force all exponent bits to 1, then AND with 0xbf7fffff (sign bit +
    // the exponent pattern of 0.5 <= |x| < 1 + all mantissa bits).
    const __m128 exponentMaximized = _mm_or_ps(v.data(), _mm_castsi128_ps(exponentBits));
    SSE::float_v ret =
        _mm_and_ps(exponentMaximized, _mm_castsi128_ps(_mm_set1_epi32(0xbf7fffffu)));
    // Special cases pass the input through unchanged: NaN, +/-inf, and +/-0;
    // frexp(0) must additionally report exponent 0.
    ret(isnan(v) || !isfinite(v) || v == SSE::float_v::Zero()) = v;
    e->setZero(v == SSE::float_v::Zero());
    return ret;
}
87
/* Returns x * 2^e (exponent adjustment done directly on the bit pattern).
 * x == NaN    -> NaN
 * x == (-)inf -> (-)inf
 */
inline SSE::double_v ldexp(SSE::double_v::AsArg v,
                           const SimdArray<int, 2, Scalar::int_v, 1> &_e)
{
    // Place each 32-bit exponent in the low half of its 64-bit lane.
    SSE::int_v e = _mm_setr_epi32(_e[0], 0, _e[1], 0);
    // 0 * 2^e must stay 0: zero the exponent delta where v is zero.
    e.setZero((v == SSE::double_v::Zero()).dataI());
    // Shift the delta into the IEEE exponent field and add it to the raw
    // bits of v — an integer add on the exponent field multiplies by 2^e.
    const __m128i exponentBits = _mm_slli_epi64(e.data(), 52);
    return _mm_castsi128_pd(_mm_add_epi64(_mm_castpd_si128(v.data()), exponentBits));
}
/* Returns x * 2^e — float overload; see the double_v overload above. */
inline SSE::float_v ldexp(SSE::float_v::AsArg v,
                          const SimdArray<int, 4, SSE::int_v, 4> &_e)
{
    SSE::int_v e = internal_data(_e);
    // 0 * 2^e must stay 0: zero the exponent delta where v is zero.
    e.setZero(simd_cast<SSE::int_m>(v == SSE::float_v::Zero()));
    // Shift the delta into the IEEE exponent field (bit 23) and add it to
    // the raw bits of v — an integer add there multiplies by 2^e.
    return reinterpret_components_cast<SSE::float_v>(
        reinterpret_components_cast<SSE::int_v>(v) + (e << 23));
}
108
109 #ifdef Vc_IMPL_SSE4_1
trunc(SSE::double_v::AsArg v)110 inline SSE::double_v trunc(SSE::double_v::AsArg v) { return _mm_round_pd(v.data(), 0x3); }
trunc(SSE::float_v::AsArg v)111 inline SSE::float_v trunc(SSE::float_v::AsArg v) { return _mm_round_ps(v.data(), 0x3); }
112
// SSE4.1 provides floor/ceil as single instructions.
inline SSE::double_v floor(SSE::double_v::AsArg v) { return _mm_floor_pd(v.data()); }
inline SSE::float_v floor(SSE::float_v::AsArg v) { return _mm_floor_ps(v.data()); }

inline SSE::double_v ceil(SSE::double_v::AsArg v) { return _mm_ceil_pd(v.data()); }
inline SSE::float_v ceil(SSE::float_v::AsArg v) { return _mm_ceil_ps(v.data()); }
118 #else
// Pre-SSE4.1 fallback: truncate via round-trip through int32.
inline SSE::Vector<float> trunc(SSE::Vector<float> x)
{
    // float -> int32 (truncating) -> float. Only valid where |x| fits the
    // int32 conversion; larger values are handled by the mask below.
    const auto truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x.data()));
    // Lanes whose exponent is < 0x4b0 still have fractional mantissa bits
    // and need the converted value; lanes with a larger exponent (including
    // inf/NaN, exponent 0xff) are already integral and keep x unchanged.
    // Note: signed compare works because the sign bit was masked off.
    const auto no_fractional_values = _mm_castsi128_ps(_mm_cmplt_epi32(
        _mm_and_si128(_mm_castps_si128(x.data()), _mm_set1_epi32(0x7f800000u)),
        _mm_set1_epi32(0x4b000000))); // the exponent is so large that no mantissa bits
                                      // signify fractional values (0x3f8 + 23*8 = 0x4b0)
    // Blend: truncated where the mask is set, original x elsewhere.
    return _mm_or_ps(_mm_andnot_ps(no_fractional_values, x.data()),
                     _mm_and_ps(no_fractional_values, truncated));
}
129
// Pre-SSE4.1 fallback: truncate doubles via the add-subtract rounding trick.
inline SSE::Vector<double> trunc(SSE::Vector<double> x)
{
    const auto abs_x = Vc::abs(x).data();
    // 2^52: the smallest double magnitude at which no mantissa bit encodes a
    // fractional value (0x3ff + 52 = 0x433).
    const auto min_no_fractional_bits =
        _mm_castsi128_pd(_mm_set1_epi64x(0x4330000000000000ull)); // 0x3ff + 52 = 0x433
    // Adding and subtracting 2^52 rounds |x| to an integer using the current
    // FP rounding mode (round-to-nearest by default). Values >= 2^52 are
    // unaffected because the add saturates the mantissa precision.
    __m128d truncated =
        _mm_sub_pd(_mm_add_pd(abs_x, min_no_fractional_bits), min_no_fractional_bits);
    // due to rounding, the result can be too large. In this case `truncated >
    // abs(x)` holds, so subtract 1 to truncated if `abs(x) < truncated`
    truncated = _mm_sub_pd(truncated,
                           _mm_and_pd(_mm_cmplt_pd(abs_x, truncated), _mm_set1_pd(1.)));
    // finally, fix the sign bit: copy the sign of x onto the truncated value.
    return _mm_or_pd(
        _mm_and_pd(_mm_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ull)), x.data()),
        truncated);
}
146
floor(SSE::Vector<T> x)147 template <typename T> inline SSE::Vector<T> floor(SSE::Vector<T> x)
148 {
149 auto y = trunc(x);
150 y(!(y == x) && x < 0) -= 1;
151 return y;
152 }
153
ceil(SSE::Vector<T> x)154 template <typename T> inline SSE::Vector<T> ceil(SSE::Vector<T> x)
155 {
156 auto y = trunc(x);
157 y(!(y == x || x < 0)) += 1;
158 return y;
159 }
160 #endif
// fma {{{1
/**
 * Fused multiply-add: returns a * b + c for each lane.
 *
 * NOTE(review): the helper call appears to update \p a in place through the
 * reference returned by a.data() — confirm against SSE::VectorHelper<T>::fma's
 * definition; if it returned its result instead, this would discard it.
 */
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> fma(Vector<T, VectorAbi::Sse> a,
                                               Vector<T, VectorAbi::Sse> b,
                                               Vector<T, VectorAbi::Sse> c)
{
    SSE::VectorHelper<T>::fma(a.data(), b.data(), c.data());
    return a;
}
170 // }}}1
171 } // namespace Vc
172
173 #endif // VC_SSE_MATH_H_
174
175 // vim: foldmethod=marker
176