/*
 * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
18
19 #include <immintrin.h>
20 #include "atan_defs.h"
21
22 extern "C" float __fss_atan_fma3(float);
23
__fss_atan_fma3(float const sa)24 float __fss_atan_fma3(float const sa) {
25 __m128 a = _mm_set1_ps(sa);
26 /* P = fpminimax(atan(x),[|1,3,5,7,9,11,13,15,17|],[|double...|],[0.000000001;1.0]); */
27 __m128 const VEC_INF = (__m128)_mm_set1_epi32(CONST_INF);
28 __m128 const VEC_SGN = (__m128)_mm_set1_epi32(CONST_SGN);
29 __m128 f_abs = _mm_and_ps(a, VEC_SGN);
30 __m128 f_sgn = _mm_xor_ps(f_abs, a);
31 __m128 inf_mask = _mm_cmp_ps(f_abs, VEC_INF, _CMP_EQ_OS);
32 __m128 const PI_HALF = _mm_set1_ps(CONST_PIOVER2);
33 __m256d const PI_HALF_D = _mm256_set1_pd(CONST_PIOVER2);
34
35 __m128 x;
36
37
38 __m128 f_rcp = _mm_rcp_ps(f_abs);
39
40 __m256d d_abs = _mm256_cvtps_pd(f_abs);
41
42 __m256d d_rcp = _mm256_cvtps_pd(f_rcp);
43
44 __m256d const VECD_CUT = _mm256_set1_pd(1.0);
45
46 __m256d d_x = _mm256_fnmadd_pd(d_rcp, d_abs, VECD_CUT);
47 d_x= _mm256_fmadd_pd(d_x,d_x,d_x);
48 __m256d rro_mask = _mm256_cmp_pd(d_abs, VECD_CUT, _CMP_GT_OS);
49 d_x = _mm256_fmadd_pd(d_rcp,d_x,d_rcp);
50 d_x = _mm256_blendv_pd(d_abs, d_x, rro_mask);
51
52 __m256d const C0 = _mm256_set1_pd(DBL17_C0);
53 __m256d const C1 = _mm256_set1_pd(DBL17_C1);
54 __m256d const C2 = _mm256_set1_pd(DBL17_C2);
55 __m256d const C3 = _mm256_set1_pd(DBL17_C3);
56 __m256d const C4 = _mm256_set1_pd(DBL17_C4);
57 __m256d const C5 = _mm256_set1_pd(DBL17_C5);
58 __m256d const C6 = _mm256_set1_pd(DBL17_C6);
59 __m256d const C7 = _mm256_set1_pd(DBL17_C7);
60 __m256d const C8 = _mm256_set1_pd(DBL17_C8);
61
62 __m256d x2 = _mm256_mul_pd(d_x, d_x);
63
64 __m256d A3 = _mm256_fmadd_pd(x2, C8, C7);
65 __m256d A2 = _mm256_fmadd_pd(x2, C5, C4);
66 __m256d A1 = _mm256_fmadd_pd(x2, C2, C1);
67
68 __m256d x6 = _mm256_mul_pd(x2, x2);
69
70 A3 = _mm256_fmadd_pd(x2, A3, C6);
71 A2 = _mm256_fmadd_pd(x2, A2, C3);
72 A1 = _mm256_fmadd_pd(x2, A1, C0);
73
74 x6 = _mm256_mul_pd(x6, x2);
75
76 A2 = _mm256_fmadd_pd(A3, x6, A2);
77
78 A1 = _mm256_fmadd_pd(A2, x6, A1);
79
80 d_x = _mm256_mul_pd(d_x, A1);
81
82 __m256d t = _mm256_sub_pd(PI_HALF_D, d_x);
83 d_x = _mm256_blendv_pd(d_x, t, rro_mask);
84 x = _mm256_cvtpd_ps(d_x);
85
86 x = _mm_blendv_ps(x, PI_HALF, inf_mask);
87
88 x = _mm_or_ps(x, f_sgn);
89 return _mm_cvtss_f32(x);
90 }
91
92