1 
2 /*
3  * Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  */
18 
19 #include <common.h>
20 #include <immintrin.h>
21 #include <math.h>
22 
23 
24 #if !(defined _CPU)
25 #error: please define _CPU - specific suffix to a function name
26 #endif
27 
28 #define _JOIN2(a,b) a##b
29 #define JOIN2(a,b) _JOIN2(a,b)
30 
31 #define atan_scalar JOIN2(__fs_atan_1_,_CPU)
32 #define FMAF __builtin_fmaf
33 
34 extern "C" float atan_scalar(float);
35 
36 
atan_scalar(const float x)37 float __attribute__((noinline)) atan_scalar(const float x) {
38 
39     bool xBig = (fabsf(x) > 1.0f);
40 
41     float xReduced = x;
42 
43     if (xBig){
44         xReduced = 1.0f / x;
45     }
46 
47     float x2 = xReduced*xReduced;
48 
49     // We evaluate the polynomial using the Horner scheme:
50     float x4 = x2 * x2;
51     float x8 = x4 * x4;
52 
53     // First layer of Estrin:
54     float L1 = FMAF(x2, C2, C1);
55     float L2 = FMAF(x2, C4, C3);
56     float L3 = FMAF(x2, C6, C5);
57     float L4 = FMAF(x2, C8, C7);
58 
59 
60     // We now want:
61     // L1 + x4*L2 + x8*L3 + x12*L4 =
62     //(L1 + x4*L2) + x8*(L3 + x4*L4)
63     // Second layer of estrin
64     float M1 = FMAF(x4, L2, L1);
65     float M2 = FMAF(x4, L4, L3);
66 
67     float poly = FMAF(x8, M2, M1);
68 
69 
70     if (xBig) {
71         const float signedPi = copysignf(PI_2, x);
72 
73         float result_d = FMAF(-x2 * xReduced, poly, (signedPi - xReduced));
74         return result_d;
75     }
76 
77     float result_d = FMAF(x2 * xReduced, poly, xReduced);
78 
79     //This fixes atanf(-0.0) = -0.0, but doesn't slow down the code seemingly
80     result_d = copysignf(result_d, x);
81 
82     return result_d;
83 }
84