/* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ #include #include #if !(defined _CPU) #error: please define _CPU - specific suffix to a function name #endif #define _JOIN2(a,b) a##b #define JOIN2(a,b) _JOIN2(a,b) #define atan2_d_scalar JOIN2(__fd_atan2_1_,_CPU) #define FMA __builtin_fma extern "C" double atan2_d_scalar(double,double); static unsigned long long int __attribute__ ((always_inline)) as_ulong(double x) { return *(unsigned long long int *)&x; } static double __attribute__ ((always_inline)) as_double(unsigned long long int x) { return *(double *)&x; } // We use the relationship between atan2(y,x) and atan(x), // as described in: // https://en.wikipedia.org/wiki/Atan2 // to create an atan2(y, x) implementation based on our atan(x) implementation // Namely: // atan2(y, x) = atan(y/x) for x > 0 // atan2(y, x) = atan(y/x) + pi for x < 0 and y >= 0 // atan2(y, x) = atan(y/x) - pi for x < 0 and y < 0 // atan2(y, x) = pi/2 for x = 0 and y > 0 // atan2(y, x) = -pi/2 for x = 0 and y < 0 // Also, from the C99 standards we need: // atan2(+-0.0, +0.0) = +-0.0 // atan2(+-0.0, -0.0) = +-PI // Special care need to be taken when both x and y are +-INFINITY, where // the ieee definitions are equivalent to letting x and y tend to infinity at // the same rate. double atan2_d_scalar(double const y, double const x) { // Special return values when both inputs are infinity, or 0's // (or any absolute equal numbers, the results are the same): if (__builtin_expect(fabs(y) == fabs(x), 1)) { // Using (as_ulong(x) > 0x7FFFFFFFFFFFFFFF) rather than (x < 0.0) here // seems to give a performance boost when looping over this function // many times double ans = FMA((as_ulong(x) > 0x7FFFFFFFFFFFFFFF), PI_OVER_2, PI_OVER_4); // Special return values for (y, x) = (+-0.0, +-0.0) if (x == 0.0) { ans = (as_ulong(x) == 0x0) ? 0.0 : PI; } return copysign(ans, y); } double xReduced; // xReduced = ((fabs(y) > fabs(x)) ? x : y) / ((fabs(y) > fabs(x)) ? y : x); // Seems to be the fastest way of getting x/y or y/x: if (fabs(y) > fabs(x)) { xReduced = x / y; } else { xReduced = y / x; } // The same Estrin scheme as is used in atan(x): double x2 = xReduced * xReduced; double x4 = x2 * x2; double x8 = x4 * x4; double x16 = x8 * x8; double L1 = FMA(x2, C6, C5); double L2 = FMA(x2, C8, C7); double L3 = FMA(x2, C10, C9); double L4 = FMA(x2, C12, C11); double L5 = FMA(x2, C14, C13); double L6 = FMA(x2, C16, C15); double L7 = FMA(x2, C18, C17); double L8 = FMA(x2, C20, C19); // L1 + x4*L2 + x8*L3 + x12*L4 + x16*L5 + x20*L6 + x24*L7 + x28*L8 double M1 = FMA(x4, L2, L1); double M2 = FMA(x4, L4, L3); double M3 = FMA(x4, L6, L5); double M4 = FMA(x4, L8, L7); // M1 + x8*M2 + x16*M3 + x24*M4 // (M1 + x8*M2) + x16*(M3 + x8*M4) // (M1 + x8*M2) + x16*(M3 + x8*M4) double N1 = FMA(x8, M2, M1); double N2 = FMA(x8, M4, M3); // c2 + x2*c3 + x4*c4 + x6*(N1 + x16*N2): double poly = FMA(x16, N2, N1); poly = FMA(x4, FMA(x2, poly, C4), FMA(x2, C3, C2)); double result_d = poly; double pi_factor = 0.0; if (fabs(y) > fabs(x)) { // pi/2 with the sign of xReduced: const double signedPi_2 = as_double( as_ulong(PI_OVER_2) | (as_ulong(xReduced) & 0x8000000000000000)); xReduced = -xReduced; pi_factor = signedPi_2; } result_d = FMA(x2 * xReduced, poly, xReduced); // Faster than the if statement: // pi with the sign of y: const double signedPi = as_double(as_ulong(PI) | (as_ulong(y) & 0x8000000000000000)); // Again, faster: pi_factor = FMA((as_ulong(x) > 0x7FFFFFFFFFFFFFFF), signedPi, pi_factor); // if ((as_ulong(x) > 0x7FFFFFFFFFFFFFFF)) { // pi_factor += signedPi; //} result_d += pi_factor; // Unfortunately we have to do a copysign here to return the correctly // signed 0.0 for atan2(+-0.0, x > 0.0) inputs: result_d = copysign(result_d, y); #ifdef IACA #pragma message("IACA END") IACA_END #endif return result_d; }