1 ///////////////////////////////////////////////////////////////////////////
2 // Inastemp - Berenger Bramas MPCDF - 2016
3 // Under MIT Licence, please you must read the LICENCE file.
4 ///////////////////////////////////////////////////////////////////////////
5 #ifndef INAVECSSE41DOUBLE_HPP
6 #define INAVECSSE41DOUBLE_HPP
7 
8 #include "SSSE3/InaVecSSSE3Double.hpp"
9 
10 #ifndef INASTEMP_USE_SSE41
11 #error InaVecSSE41<double> is included but SSE41 is not enable in the configuration
12 #endif
13 
14 #include <tmmintrin.h>
15 #include <emmintrin.h>
16 #include <smmintrin.h>
17 
18 template <class RealType>
19 class InaVecSSE41;
20 
21 template <>
22 class alignas(16) InaVecSSE41<double> : public InaVecSSSE3<double> {
23     using Parent = InaVecSSSE3<double>;
24 
25 public:
26     using InaVecSSSE3<double>::InaVecSSSE3;
27 
InaVecSSE41()28     inline InaVecSSE41(){}
29 
InaVecSSE41(const InaVecSSSE3<double> & other)30     inline InaVecSSE41(const InaVecSSSE3<double>& other)
31         : Parent(other){}
32 
33     // Re-put exp to benefit from floor
exp() const34     inline InaVecSSE41<double> exp() const {
35 #ifdef __INTEL_COMPILER
36         return _mm_exp_pd(Parent::vec);
37 #else
38         const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E()));
39         const __m128d COEFF_A     = _mm_set1_pd(double(InaFastExp::CoeffA64()));
40         const __m128d COEFF_B     = _mm_set1_pd(double(InaFastExp::CoeffB64()));
41         const __m128d COEFF_P5_X  = _mm_set1_pd(double(InaFastExp::GetCoefficient9_8()));
42         const __m128d COEFF_P5_Y  = _mm_set1_pd(double(InaFastExp::GetCoefficient9_7()));
43         const __m128d COEFF_P5_Z  = _mm_set1_pd(double(InaFastExp::GetCoefficient9_6()));
44         const __m128d COEFF_P5_A  = _mm_set1_pd(double(InaFastExp::GetCoefficient9_5()));
45         const __m128d COEFF_P5_B  = _mm_set1_pd(double(InaFastExp::GetCoefficient9_4()));
46         const __m128d COEFF_P5_C  = _mm_set1_pd(double(InaFastExp::GetCoefficient9_3()));
47         const __m128d COEFF_P5_D  = _mm_set1_pd(double(InaFastExp::GetCoefficient9_2()));
48         const __m128d COEFF_P5_E  = _mm_set1_pd(double(InaFastExp::GetCoefficient9_1()));
49         const __m128d COEFF_P5_F  = _mm_set1_pd(double(InaFastExp::GetCoefficient9_0()));
50 
51         __m128d x = _mm_mul_pd(Parent::vec, COEFF_LOG2E);
52 
53         const __m128d fractional_part = _mm_sub_pd(x, InaVecSSE41(x).floor().vec);
54 
55         __m128d factor = _mm_add_pd(_mm_mul_pd(_mm_add_pd(
56                          _mm_mul_pd(_mm_add_pd( _mm_mul_pd(_mm_add_pd(
57                          _mm_mul_pd(_mm_add_pd( _mm_mul_pd(_mm_add_pd(
58                          _mm_mul_pd(_mm_add_pd( _mm_mul_pd(_mm_add_pd(_mm_mul_pd(
59                          COEFF_P5_X, fractional_part), COEFF_P5_Y), fractional_part),
60                          COEFF_P5_Z),fractional_part), COEFF_P5_A), fractional_part),
61                          COEFF_P5_B), fractional_part), COEFF_P5_C),fractional_part),
62                          COEFF_P5_D), fractional_part), COEFF_P5_E),fractional_part),
63                          COEFF_P5_F);
64 
65         x = _mm_sub_pd(x,factor);
66 
67         x = _mm_add_pd(_mm_mul_pd(COEFF_A, x), COEFF_B);
68 
69         alignas(64) long int allvalint[VecLength] = { _mm_cvtsd_si64(x),
70                                                       _mm_cvtsd_si64(_mm_shuffle_pd(x, x, 1)) };
71 
72         return _mm_castsi128_pd(_mm_set_epi64x(allvalint[1], allvalint[0]));
73 #endif
74     }
75 
expLowAcc() const76     inline InaVecSSE41<double> expLowAcc() const {
77         const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E()));
78         const __m128d COEFF_A     = _mm_set1_pd(double(InaFastExp::CoeffA64()));
79         const __m128d COEFF_B     = _mm_set1_pd(double(InaFastExp::CoeffB64()));
80         const __m128d COEFF_P5_C  = _mm_set1_pd(double(InaFastExp::GetCoefficient4_3()));
81         const __m128d COEFF_P5_D  = _mm_set1_pd(double(InaFastExp::GetCoefficient4_2()));
82         const __m128d COEFF_P5_E  = _mm_set1_pd(double(InaFastExp::GetCoefficient4_1()));
83         const __m128d COEFF_P5_F  = _mm_set1_pd(double(InaFastExp::GetCoefficient4_0()));
84 
85         __m128d x = _mm_mul_pd(Parent::vec, COEFF_LOG2E);
86 
87         const __m128d fractional_part = _mm_sub_pd(x, InaVecSSE41(x).floor().vec);
88 
89         __m128d factor = _mm_add_pd(_mm_mul_pd(_mm_add_pd(
90                          _mm_mul_pd(_mm_add_pd(_mm_mul_pd(
91                                          COEFF_P5_C, fractional_part),
92                                          COEFF_P5_D), fractional_part),
93                                          COEFF_P5_E), fractional_part),
94                                          COEFF_P5_F);
95 
96         x = _mm_sub_pd(x,factor);
97 
98         x = _mm_add_pd(_mm_mul_pd(COEFF_A, x), COEFF_B);
99 
100         alignas(64) long int allvalint[VecLength] = { _mm_cvtsd_si64(x),
101                                                       _mm_cvtsd_si64(_mm_shuffle_pd(x, x, 1)) };
102 
103         return _mm_castsi128_pd(_mm_set_epi64x(allvalint[1], allvalint[0]));
104     }
105 
floor() const106     inline InaVecSSE41<double> floor() const {
107         return _mm_floor_pd(Parent::vec);
108     }
109 
GetName()110     inline static const char* GetName(){
111         return "InaVecSSE41<double>";
112     }
113 
If(const typename Parent::MaskType & inTest)114     inline static InaIfElse< InaVecSSE41<double> >::ThenClass If(const typename Parent::MaskType& inTest) {
115         return InaIfElse< InaVecSSE41<double> >::IfClass().If(inTest);
116     }
117 };
118 
119 #endif
120