/*
 * Copyright (c) 2016-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

#include <immintrin.h>
#include "atan_defs.h"

extern "C" float __fss_atan_fma3(float);

float __fss_atan_fma3(float const sa) {
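    /* Broadcast the scalar argument into all lanes of an SSE register. */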
    __m128 a = _mm_set1_ps(sa);
/* P = fpminimax(atan(x),[|1,3,5,7,9,11,13,15,17|],[|double...|],[0.000000001;1.0]); */
    __m128 const VEC_INF = (__m128)_mm_set1_epi32(CONST_INF);
    __m128 const VEC_SGN = (__m128)_mm_set1_epi32(CONST_SGN);
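    /* Split the argument into magnitude and sign bit; the sign is reapplied at
     * the end.  inf_mask flags inputs whose magnitude is infinity so the result
     * can be forced to pi/2 below. */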
    __m128 f_abs = _mm_and_ps(a, VEC_SGN);
    __m128 f_sgn = _mm_xor_ps(f_abs, a);
    __m128 inf_mask = _mm_cmp_ps(f_abs, VEC_INF, _CMP_EQ_OS);
    __m128 const PI_HALF = _mm_set1_ps(CONST_PIOVER2);
    __m256d const PI_HALF_D = _mm256_set1_pd(CONST_PIOVER2);

    __m128 x;

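    /* Fast single-precision reciprocal estimate of |x|; only used (after
     * refinement below) for the |x| > 1 path. */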
    __m128 f_rcp = _mm_rcp_ps(f_abs);

    __m256d d_abs = _mm256_cvtps_pd(f_abs);
    __m256d d_rcp = _mm256_cvtps_pd(f_rcp);

    __m256d const VECD_CUT = _mm256_set1_pd(1.0);

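    /* For |x| > 1 the polynomial argument is 1/|x|, using the identity
     * atan(x) = pi/2 - atan(1/x).  The reciprocal estimate is refined in double
     * precision: e = 1 - rcp*|x|, then rcp*(1 + e + e*e) ~= 1/|x|.
     * rro_mask selects between |x| (when |x| <= 1) and the refined reciprocal. */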
    __m256d d_x = _mm256_fnmadd_pd(d_rcp, d_abs, VECD_CUT);
            d_x = _mm256_fmadd_pd(d_x, d_x, d_x);
    __m256d rro_mask = _mm256_cmp_pd(d_abs, VECD_CUT, _CMP_GT_OS);
            d_x = _mm256_fmadd_pd(d_rcp, d_x, d_rcp);
            d_x = _mm256_blendv_pd(d_abs, d_x, rro_mask);

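    /* Coefficients of the odd degree-17 minimax polynomial from the fpminimax
     * fit quoted above: atan(t) ~= t * P(t^2) for t in (0, 1]. */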
    __m256d const C0 = _mm256_set1_pd(DBL17_C0);
    __m256d const C1 = _mm256_set1_pd(DBL17_C1);
    __m256d const C2 = _mm256_set1_pd(DBL17_C2);
    __m256d const C3 = _mm256_set1_pd(DBL17_C3);
    __m256d const C4 = _mm256_set1_pd(DBL17_C4);
    __m256d const C5 = _mm256_set1_pd(DBL17_C5);
    __m256d const C6 = _mm256_set1_pd(DBL17_C6);
    __m256d const C7 = _mm256_set1_pd(DBL17_C7);
    __m256d const C8 = _mm256_set1_pd(DBL17_C8);

    __m256d x2 = _mm256_mul_pd(d_x, d_x);

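    /* Evaluate P(x2) by splitting it into three quadratics in x2 (A1, A2, A3)
     * and recombining them with x2^3 and x2^6 (an Estrin-style scheme), then
     * multiply by the argument to form x * P(x^2). */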
    __m256d A3 = _mm256_fmadd_pd(x2, C8, C7);
    __m256d A2 = _mm256_fmadd_pd(x2, C5, C4);
    __m256d A1 = _mm256_fmadd_pd(x2, C2, C1);

    __m256d x6 = _mm256_mul_pd(x2, x2);

            A3 = _mm256_fmadd_pd(x2, A3, C6);
            A2 = _mm256_fmadd_pd(x2, A2, C3);
            A1 = _mm256_fmadd_pd(x2, A1, C0);

            x6 = _mm256_mul_pd(x6, x2);

            A2 = _mm256_fmadd_pd(A3, x6, A2);
            A1 = _mm256_fmadd_pd(A2, x6, A1);

            d_x = _mm256_mul_pd(d_x, A1);

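    /* Fold the large-argument case back: atan(|x|) = pi/2 - atan(1/|x|) when |x| > 1. */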
    __m256d t = _mm256_sub_pd(PI_HALF_D, d_x);
    d_x = _mm256_blendv_pd(d_x, t, rro_mask);
    x = _mm256_cvtpd_ps(d_x);

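    /* atan(+/-inf) = +/-pi/2: substitute pi/2 where the input magnitude was infinite. */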
    x = _mm_blendv_ps(x, PI_HALF, inf_mask);

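    /* Reapply the original sign and return the scalar result from lane 0. */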
    x = _mm_or_ps(x, f_sgn);
    return _mm_cvtss_f32(x);
}