
/*
 * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

#if defined(TARGET_LINUX_POWER)
#include "xmm2altivec.h"
#elif defined(TARGET_LINUX_ARM64)
#include "arm64intrin.h"
#else
#include <immintrin.h>
#endif
#include <math.h>
#include "dacos_defs.h"

#define FMA __builtin_fma

extern "C" double __fsd_acos_fma3(double);

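/*
 * Bit-pattern helpers: reinterpret a double as a 64-bit integer and back,
 * used below for the |a| mask and sign manipulation.
 */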
static long long __double_as_ll(double const a) {
    return *(long long*)&a;
}

static double __ll_as_double(long long const a) {
    return *(double*)&a;
}

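/*
 * Double-precision acos using FMA3 vector arithmetic.  Two cases, selected
 * by comparing |a| against THRESHOLD_D:
 *   - on the main interval, acos(a) is approximated by a polynomial in a;
 *   - near (and beyond) |a| = 1, it is computed as sqrt(1 - |a|) times a
 *     polynomial in |a|, with a pi/sign correction for negative arguments.
 */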
double __fsd_acos_fma3(double const a)
{
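    /*
     * Coefficients from dacos_defs.h, packed two per __m128d so each vector
     * FMA advances two Horner evaluations (one per lane) at once.  Note that
     * GN0 folds the linear -a term of the main-interval polynomial into its
     * upper lane.
     */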
    __m128d const AH0 = _mm_setr_pd(A0_D, H0_D);
    __m128d const BI0 = _mm_setr_pd(B0_D, I0_D);
    __m128d const CJ0 = _mm_setr_pd(C0_D, J0_D);
    __m128d const DK0 = _mm_setr_pd(D0_D, K0_D);
    __m128d const EL0 = _mm_setr_pd(E0_D, L0_D);
    __m128d const FM0 = _mm_setr_pd(F0_D, M0_D);
    __m128d const GN0 = _mm_setr_pd(G0_D, N0_D - a);

    __m128d const AH1 = _mm_setr_pd(A1_D, H1_D);
    __m128d const BI1 = _mm_setr_pd(B1_D, I1_D);
    __m128d const CJ1 = _mm_setr_pd(C1_D, J1_D);
    __m128d const DK1 = _mm_setr_pd(D1_D, K1_D);
    __m128d const EL1 = _mm_setr_pd(E1_D, L1_D);
    __m128d const FM1 = _mm_setr_pd(F1_D, M1_D);

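    /* x = |a|; x2, a3 and x6 are powers of a shared by both branches. */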
    double res;
    double x = __ll_as_double(ABS_MASK_LL & __double_as_ll(a));
    double x2 = a * a;
    double a3 = x2 * a;
    double x6 = a3 * a3;

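    /*
     * |a| at or above THRESHOLD_D: compute acos from sqrt(1 - |a|) times a
     * polynomial in |a|.  Arguments with |a| > 1 reach sqrt() with a negative
     * operand, producing NaN (and setting errno, see below).
     */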
    if (__double_as_ll(x) >= __double_as_ll(THRESHOLD_D))
    {
        double sq = 1.0 - x;
        /*
         * There seems to be a consensus that setting errno is important
         * for fastmath intrinsics, so call the libm sqrt() here instead of
         * using the Intel hardware sqrt instruction.
         */
        sq = sqrt(sq);

        double pi_hi = a < 0.0 ? PI_HI_D : 0.0;
        long long fix = (long long)(a > 1.0) << 63;
        long long sign = SGN_MASK_LL & __double_as_ll(a);
        fix = fix ^ sign;

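        /* Evaluate the two halves of the polynomial in parallel, one per lane. */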
        __m128d _x = _mm_set1_pd(x);
        __m128d _p1;
        _p1 = _mm_fmadd_pd(AH1, _x, BI1);
        _p1 = _mm_fmadd_pd(_p1, _x, CJ1);
        _p1 = _mm_fmadd_pd(_p1, _x, DK1);
        _p1 = _mm_fmadd_pd(_p1, _x, EL1);
        _p1 = _mm_fmadd_pd(_p1, _x, FM1);

        double p1hi = _mm_cvtsd_f64(_p1);
        _p1 = _mm_shuffle_pd(_p1, _p1, 3);
        double p1lo = _mm_cvtsd_f64(_p1);

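        /* Merge the halves (high half scaled by x^6), then scale by sqrt(1 - |a|). */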
        p1hi = FMA(p1hi, x, G1_D);
        double p1 = FMA(p1hi, x6, p1lo);
        p1 = FMA(sq, p1, -pi_hi);

        res = __ll_as_double(fix ^ __double_as_ll(p1));
    }
    else
    {
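        /*
         * Main interval: acos(a) is approximated by a polynomial in a,
         * evaluated as two Horner halves that are recombined via a^15 below.
         */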
        __m128d _x2 = _mm_set1_pd(x2);
        __m128d _x2_a3 = _mm_setr_pd(x2, a3);
        __m128d _p0;

        _p0 = _mm_fmadd_pd(AH0, _x2, BI0);
        _p0 = _mm_fmadd_pd(_p0, _x2, CJ0);
        _p0 = _mm_fmadd_pd(_p0, _x2, DK0);
        _p0 = _mm_fmadd_pd(_p0, _x2, EL0);
        _p0 = _mm_fmadd_pd(_p0, _x2, FM0);
        _p0 = _mm_fmadd_pd(_p0, _x2_a3, GN0);

        double p0hi = _mm_cvtsd_f64(_p0);
        _p0 = _mm_shuffle_pd(_p0, _p0, 3);
        double p0lo = _mm_cvtsd_f64(_p0);

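        /* Recombine: high-order half scaled by a^15, plus the low-order half. */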
        double x12 = x6 * x6;
        double a15 = x12 * a3;
        res = FMA(p0hi, a15, p0lo);
    }

    return res;
}