1 /////////////////////////////////////////////////////////////////////////// 2 // Inastemp - Berenger Bramas MPCDF - 2016 3 // Under MIT Licence, please you must read the LICENCE file. 4 /////////////////////////////////////////////////////////////////////////// 5 #ifndef INAVECSSE41DOUBLE_HPP 6 #define INAVECSSE41DOUBLE_HPP 7 8 #include "SSSE3/InaVecSSSE3Double.hpp" 9 10 #ifndef INASTEMP_USE_SSE41 11 #error InaVecSSE41<double> is included but SSE41 is not enable in the configuration 12 #endif 13 14 #include <tmmintrin.h> 15 #include <emmintrin.h> 16 #include <smmintrin.h> 17 18 template <class RealType> 19 class InaVecSSE41; 20 21 template <> 22 class alignas(16) InaVecSSE41<double> : public InaVecSSSE3<double> { 23 using Parent = InaVecSSSE3<double>; 24 25 public: 26 using InaVecSSSE3<double>::InaVecSSSE3; 27 InaVecSSE41()28 inline InaVecSSE41(){} 29 InaVecSSE41(const InaVecSSSE3<double> & other)30 inline InaVecSSE41(const InaVecSSSE3<double>& other) 31 : Parent(other){} 32 33 // Re-put exp to benefit from floor exp() const34 inline InaVecSSE41<double> exp() const { 35 #ifdef __INTEL_COMPILER 36 return _mm_exp_pd(Parent::vec); 37 #else 38 const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E())); 39 const __m128d COEFF_A = _mm_set1_pd(double(InaFastExp::CoeffA64())); 40 const __m128d COEFF_B = _mm_set1_pd(double(InaFastExp::CoeffB64())); 41 const __m128d COEFF_P5_X = _mm_set1_pd(double(InaFastExp::GetCoefficient9_8())); 42 const __m128d COEFF_P5_Y = _mm_set1_pd(double(InaFastExp::GetCoefficient9_7())); 43 const __m128d COEFF_P5_Z = _mm_set1_pd(double(InaFastExp::GetCoefficient9_6())); 44 const __m128d COEFF_P5_A = _mm_set1_pd(double(InaFastExp::GetCoefficient9_5())); 45 const __m128d COEFF_P5_B = _mm_set1_pd(double(InaFastExp::GetCoefficient9_4())); 46 const __m128d COEFF_P5_C = _mm_set1_pd(double(InaFastExp::GetCoefficient9_3())); 47 const __m128d COEFF_P5_D = _mm_set1_pd(double(InaFastExp::GetCoefficient9_2())); 48 const __m128d COEFF_P5_E = _mm_set1_pd(double(InaFastExp::GetCoefficient9_1())); 49 const __m128d COEFF_P5_F = _mm_set1_pd(double(InaFastExp::GetCoefficient9_0())); 50 51 __m128d x = _mm_mul_pd(Parent::vec, COEFF_LOG2E); 52 53 const __m128d fractional_part = _mm_sub_pd(x, InaVecSSE41(x).floor().vec); 54 55 __m128d factor = _mm_add_pd(_mm_mul_pd(_mm_add_pd( 56 _mm_mul_pd(_mm_add_pd( _mm_mul_pd(_mm_add_pd( 57 _mm_mul_pd(_mm_add_pd( _mm_mul_pd(_mm_add_pd( 58 _mm_mul_pd(_mm_add_pd( _mm_mul_pd(_mm_add_pd(_mm_mul_pd( 59 COEFF_P5_X, fractional_part), COEFF_P5_Y), fractional_part), 60 COEFF_P5_Z),fractional_part), COEFF_P5_A), fractional_part), 61 COEFF_P5_B), fractional_part), COEFF_P5_C),fractional_part), 62 COEFF_P5_D), fractional_part), COEFF_P5_E),fractional_part), 63 COEFF_P5_F); 64 65 x = _mm_sub_pd(x,factor); 66 67 x = _mm_add_pd(_mm_mul_pd(COEFF_A, x), COEFF_B); 68 69 alignas(64) long int allvalint[VecLength] = { _mm_cvtsd_si64(x), 70 _mm_cvtsd_si64(_mm_shuffle_pd(x, x, 1)) }; 71 72 return _mm_castsi128_pd(_mm_set_epi64x(allvalint[1], allvalint[0])); 73 #endif 74 } 75 expLowAcc() const76 inline InaVecSSE41<double> expLowAcc() const { 77 const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E())); 78 const __m128d COEFF_A = _mm_set1_pd(double(InaFastExp::CoeffA64())); 79 const __m128d COEFF_B = _mm_set1_pd(double(InaFastExp::CoeffB64())); 80 const __m128d COEFF_P5_C = _mm_set1_pd(double(InaFastExp::GetCoefficient4_3())); 81 const __m128d COEFF_P5_D = _mm_set1_pd(double(InaFastExp::GetCoefficient4_2())); 82 const __m128d COEFF_P5_E = _mm_set1_pd(double(InaFastExp::GetCoefficient4_1())); 83 const __m128d COEFF_P5_F = _mm_set1_pd(double(InaFastExp::GetCoefficient4_0())); 84 85 __m128d x = _mm_mul_pd(Parent::vec, COEFF_LOG2E); 86 87 const __m128d fractional_part = _mm_sub_pd(x, InaVecSSE41(x).floor().vec); 88 89 __m128d factor = _mm_add_pd(_mm_mul_pd(_mm_add_pd( 90 _mm_mul_pd(_mm_add_pd(_mm_mul_pd( 91 COEFF_P5_C, fractional_part), 92 COEFF_P5_D), fractional_part), 93 COEFF_P5_E), fractional_part), 94 COEFF_P5_F); 95 96 x = _mm_sub_pd(x,factor); 97 98 x = _mm_add_pd(_mm_mul_pd(COEFF_A, x), COEFF_B); 99 100 alignas(64) long int allvalint[VecLength] = { _mm_cvtsd_si64(x), 101 _mm_cvtsd_si64(_mm_shuffle_pd(x, x, 1)) }; 102 103 return _mm_castsi128_pd(_mm_set_epi64x(allvalint[1], allvalint[0])); 104 } 105 floor() const106 inline InaVecSSE41<double> floor() const { 107 return _mm_floor_pd(Parent::vec); 108 } 109 GetName()110 inline static const char* GetName(){ 111 return "InaVecSSE41<double>"; 112 } 113 If(const typename Parent::MaskType & inTest)114 inline static InaIfElse< InaVecSSE41<double> >::ThenClass If(const typename Parent::MaskType& inTest) { 115 return InaIfElse< InaVecSSE41<double> >::IfClass().If(inTest); 116 } 117 }; 118 119 #endif 120