1
2 /*
3 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 */
18
19 #include <common.h>
20 #include <immintrin.h>
21 #include <math.h>
22
23
24 #if !(defined _CPU)
25 #error: please define _CPU - specific suffix to a function name
26 #endif
27
28 #define _JOIN2(a,b) a##b
29 #define JOIN2(a,b) _JOIN2(a,b)
30
31 #define atan_scalar JOIN2(__fs_atan_1_,_CPU)
32 #define FMAF __builtin_fmaf
33
34 extern "C" float atan_scalar(float);
35
36
atan_scalar(const float x)37 float __attribute__((noinline)) atan_scalar(const float x) {
38
39 bool xBig = (fabsf(x) > 1.0f);
40
41 float xReduced = x;
42
43 if (xBig){
44 xReduced = 1.0f / x;
45 }
46
47 float x2 = xReduced*xReduced;
48
49 // We evaluate the polynomial using the Horner scheme:
50 float x4 = x2 * x2;
51 float x8 = x4 * x4;
52
53 // First layer of Estrin:
54 float L1 = FMAF(x2, C2, C1);
55 float L2 = FMAF(x2, C4, C3);
56 float L3 = FMAF(x2, C6, C5);
57 float L4 = FMAF(x2, C8, C7);
58
59
60 // We now want:
61 // L1 + x4*L2 + x8*L3 + x12*L4 =
62 //(L1 + x4*L2) + x8*(L3 + x4*L4)
63 // Second layer of estrin
64 float M1 = FMAF(x4, L2, L1);
65 float M2 = FMAF(x4, L4, L3);
66
67 float poly = FMAF(x8, M2, M1);
68
69
70 if (xBig) {
71 const float signedPi = copysignf(PI_2, x);
72
73 float result_d = FMAF(-x2 * xReduced, poly, (signedPi - xReduced));
74 return result_d;
75 }
76
77 float result_d = FMAF(x2 * xReduced, poly, xReduced);
78
79 //This fixes atanf(-0.0) = -0.0, but doesn't slow down the code seemingly
80 result_d = copysignf(result_d, x);
81
82 return result_d;
83 }
84