1 
2 /*
3  * Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  */
18 
19 #include <common.h>
20 #include <math.h>
21 
22 #if !(defined _CPU)
23 #error: please define _CPU - specific suffix to a function name
24 #endif
25 
26 #define _JOIN2(a,b) a##b
27 #define JOIN2(a,b) _JOIN2(a,b)
28 
29 #define atan2_d_scalar JOIN2(__fd_atan2_1_,_CPU)
30 #define FMA __builtin_fma
31 
32 extern "C" double atan2_d_scalar(double,double);
33 
34 
as_ulong(double x)35 static unsigned long long int __attribute__ ((always_inline)) as_ulong(double x)
36 {
37     return *(unsigned long long int *)&x;
38 }
39 
as_double(unsigned long long int x)40 static double __attribute__ ((always_inline)) as_double(unsigned long long int x)
41 {
42     return *(double *)&x;
43 }
44 
45 // We use the relationship between atan2(y,x) and atan(x),
46 // as described in:
47 // https://en.wikipedia.org/wiki/Atan2
48 // to create an atan2(y, x) implementation based on our atan(x) implementation
49 // Namely:
50 // atan2(y, x) = atan(y/x) for x > 0
51 // atan2(y, x) = atan(y/x) + pi for x < 0 and y >= 0
52 // atan2(y, x) = atan(y/x) - pi for x < 0 and y < 0
53 // atan2(y, x) =  pi/2 for x = 0 and y > 0
54 // atan2(y, x) = -pi/2 for x = 0 and y < 0
55 
56 // Also, from the C99 standards we need:
57 // atan2(+-0.0, +0.0) = +-0.0
58 // atan2(+-0.0, -0.0) = +-PI
59 
60 // Special care need to be taken when both x and y are +-INFINITY, where
61 // the ieee definitions are equivalent to letting x and y tend to infinity at
62 // the same rate.
63 
atan2_d_scalar(double const y,double const x)64 double atan2_d_scalar(double const y, double const x) {
65 
66     // Special return values when both inputs are infinity, or 0's
67     // (or any absolute equal numbers, the results are the same):
68     if (__builtin_expect(fabs(y) == fabs(x), 1)) {
69 
70         // Using (as_ulong(x) > 0x7FFFFFFFFFFFFFFF) rather than (x < 0.0) here
71         // seems to give a performance boost when looping over this function
72         // many times
73         double ans =
74             FMA((as_ulong(x) > 0x7FFFFFFFFFFFFFFF), PI_OVER_2, PI_OVER_4);
75 
76         // Special return values for (y, x) = (+-0.0, +-0.0)
77         if (x == 0.0) {
78             ans = (as_ulong(x) == 0x0) ? 0.0 : PI;
79         }
80 
81         return copysign(ans, y);
82     }
83 
84     double xReduced;
85 
86     // xReduced = ((fabs(y) > fabs(x)) ? x : y) / ((fabs(y) > fabs(x)) ? y : x);
87     // Seems to be the fastest way of getting x/y or y/x:
88     if (fabs(y) > fabs(x)) {
89         xReduced = x / y;
90     } else {
91         xReduced = y / x;
92     }
93 
94     // The same Estrin scheme as is used in atan(x):
95     double x2 = xReduced * xReduced;
96     double x4 = x2 * x2;
97     double x8 = x4 * x4;
98     double x16 = x8 * x8;
99 
100     double L1 = FMA(x2, C6, C5);
101     double L2 = FMA(x2, C8, C7);
102     double L3 = FMA(x2, C10, C9);
103     double L4 = FMA(x2, C12, C11);
104     double L5 = FMA(x2, C14, C13);
105     double L6 = FMA(x2, C16, C15);
106     double L7 = FMA(x2, C18, C17);
107     double L8 = FMA(x2, C20, C19);
108 
109     // L1 + x4*L2 + x8*L3 + x12*L4 + x16*L5 + x20*L6 + x24*L7 + x28*L8
110     double M1 = FMA(x4, L2, L1);
111     double M2 = FMA(x4, L4, L3);
112     double M3 = FMA(x4, L6, L5);
113     double M4 = FMA(x4, L8, L7);
114 
115     // M1 + x8*M2 + x16*M3 + x24*M4
116     // (M1 + x8*M2) + x16*(M3 + x8*M4)
117     // (M1 + x8*M2) + x16*(M3 + x8*M4)
118     double N1 = FMA(x8, M2, M1);
119     double N2 = FMA(x8, M4, M3);
120 
121     // c2 + x2*c3 + x4*c4 + x6*(N1 + x16*N2):
122     double poly = FMA(x16, N2, N1);
123 
124     poly = FMA(x4, FMA(x2, poly, C4), FMA(x2, C3, C2));
125 
126     double result_d = poly;
127 
128     double pi_factor = 0.0;
129 
130     if (fabs(y) > fabs(x)) {
131         // pi/2 with the sign of xReduced:
132         const double signedPi_2 = as_double(
133             as_ulong(PI_OVER_2) | (as_ulong(xReduced) & 0x8000000000000000));
134 
135         xReduced = -xReduced;
136         pi_factor = signedPi_2;
137     }
138 
139     result_d = FMA(x2 * xReduced, poly, xReduced);
140 
141     // Faster than the if statement:
142     // pi with the sign of y:
143     const double signedPi =
144         as_double(as_ulong(PI) | (as_ulong(y) & 0x8000000000000000));
145 
146     // Again, faster:
147     pi_factor = FMA((as_ulong(x) > 0x7FFFFFFFFFFFFFFF), signedPi, pi_factor);
148     // if ((as_ulong(x) > 0x7FFFFFFFFFFFFFFF)) {
149     //    pi_factor += signedPi;
150     //}
151 
152     result_d += pi_factor;
153 
154     // Unfortunately we have to do a copysign here to return the correctly
155     // signed 0.0 for atan2(+-0.0, x > 0.0) inputs:
156     result_d = copysign(result_d, y);
157 
158 #ifdef IACA
159 #pragma message("IACA END")
160     IACA_END
161 #endif
162 
163     return result_d;
164 }
165