1
2 /*
3 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 */
18
19 #include <common.h>
20 #include <math.h>
21
22 #if !(defined _CPU)
23 #error: please define _CPU - specific suffix to a function name
24 #endif
25
26 #define _JOIN2(a,b) a##b
27 #define JOIN2(a,b) _JOIN2(a,b)
28
29 #define atan2_d_scalar JOIN2(__fd_atan2_1_,_CPU)
30 #define FMA __builtin_fma
31
32 extern "C" double atan2_d_scalar(double,double);
33
34
as_ulong(double x)35 static unsigned long long int __attribute__ ((always_inline)) as_ulong(double x)
36 {
37 return *(unsigned long long int *)&x;
38 }
39
as_double(unsigned long long int x)40 static double __attribute__ ((always_inline)) as_double(unsigned long long int x)
41 {
42 return *(double *)&x;
43 }
44
45 // We use the relationship between atan2(y,x) and atan(x),
46 // as described in:
47 // https://en.wikipedia.org/wiki/Atan2
48 // to create an atan2(y, x) implementation based on our atan(x) implementation
49 // Namely:
50 // atan2(y, x) = atan(y/x) for x > 0
51 // atan2(y, x) = atan(y/x) + pi for x < 0 and y >= 0
52 // atan2(y, x) = atan(y/x) - pi for x < 0 and y < 0
53 // atan2(y, x) = pi/2 for x = 0 and y > 0
54 // atan2(y, x) = -pi/2 for x = 0 and y < 0
55
56 // Also, from the C99 standards we need:
57 // atan2(+-0.0, +0.0) = +-0.0
58 // atan2(+-0.0, -0.0) = +-PI
59
60 // Special care need to be taken when both x and y are +-INFINITY, where
61 // the ieee definitions are equivalent to letting x and y tend to infinity at
62 // the same rate.
63
atan2_d_scalar(double const y,double const x)64 double atan2_d_scalar(double const y, double const x) {
65
66 // Special return values when both inputs are infinity, or 0's
67 // (or any absolute equal numbers, the results are the same):
68 if (__builtin_expect(fabs(y) == fabs(x), 1)) {
69
70 // Using (as_ulong(x) > 0x7FFFFFFFFFFFFFFF) rather than (x < 0.0) here
71 // seems to give a performance boost when looping over this function
72 // many times
73 double ans =
74 FMA((as_ulong(x) > 0x7FFFFFFFFFFFFFFF), PI_OVER_2, PI_OVER_4);
75
76 // Special return values for (y, x) = (+-0.0, +-0.0)
77 if (x == 0.0) {
78 ans = (as_ulong(x) == 0x0) ? 0.0 : PI;
79 }
80
81 return copysign(ans, y);
82 }
83
84 double xReduced;
85
86 // xReduced = ((fabs(y) > fabs(x)) ? x : y) / ((fabs(y) > fabs(x)) ? y : x);
87 // Seems to be the fastest way of getting x/y or y/x:
88 if (fabs(y) > fabs(x)) {
89 xReduced = x / y;
90 } else {
91 xReduced = y / x;
92 }
93
94 // The same Estrin scheme as is used in atan(x):
95 double x2 = xReduced * xReduced;
96 double x4 = x2 * x2;
97 double x8 = x4 * x4;
98 double x16 = x8 * x8;
99
100 double L1 = FMA(x2, C6, C5);
101 double L2 = FMA(x2, C8, C7);
102 double L3 = FMA(x2, C10, C9);
103 double L4 = FMA(x2, C12, C11);
104 double L5 = FMA(x2, C14, C13);
105 double L6 = FMA(x2, C16, C15);
106 double L7 = FMA(x2, C18, C17);
107 double L8 = FMA(x2, C20, C19);
108
109 // L1 + x4*L2 + x8*L3 + x12*L4 + x16*L5 + x20*L6 + x24*L7 + x28*L8
110 double M1 = FMA(x4, L2, L1);
111 double M2 = FMA(x4, L4, L3);
112 double M3 = FMA(x4, L6, L5);
113 double M4 = FMA(x4, L8, L7);
114
115 // M1 + x8*M2 + x16*M3 + x24*M4
116 // (M1 + x8*M2) + x16*(M3 + x8*M4)
117 // (M1 + x8*M2) + x16*(M3 + x8*M4)
118 double N1 = FMA(x8, M2, M1);
119 double N2 = FMA(x8, M4, M3);
120
121 // c2 + x2*c3 + x4*c4 + x6*(N1 + x16*N2):
122 double poly = FMA(x16, N2, N1);
123
124 poly = FMA(x4, FMA(x2, poly, C4), FMA(x2, C3, C2));
125
126 double result_d = poly;
127
128 double pi_factor = 0.0;
129
130 if (fabs(y) > fabs(x)) {
131 // pi/2 with the sign of xReduced:
132 const double signedPi_2 = as_double(
133 as_ulong(PI_OVER_2) | (as_ulong(xReduced) & 0x8000000000000000));
134
135 xReduced = -xReduced;
136 pi_factor = signedPi_2;
137 }
138
139 result_d = FMA(x2 * xReduced, poly, xReduced);
140
141 // Faster than the if statement:
142 // pi with the sign of y:
143 const double signedPi =
144 as_double(as_ulong(PI) | (as_ulong(y) & 0x8000000000000000));
145
146 // Again, faster:
147 pi_factor = FMA((as_ulong(x) > 0x7FFFFFFFFFFFFFFF), signedPi, pi_factor);
148 // if ((as_ulong(x) > 0x7FFFFFFFFFFFFFFF)) {
149 // pi_factor += signedPi;
150 //}
151
152 result_d += pi_factor;
153
154 // Unfortunately we have to do a copysign here to return the correctly
155 // signed 0.0 for atan2(+-0.0, x > 0.0) inputs:
156 result_d = copysign(result_d, y);
157
158 #ifdef IACA
159 #pragma message("IACA END")
160 IACA_END
161 #endif
162
163 return result_d;
164 }
165