/*
 * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

#if defined(TARGET_LINUX_POWER)
#include "xmm2altivec.h"
#elif defined(TARGET_LINUX_ARM64)
#include "arm64intrin.h"
#else
#include <immintrin.h>
#endif
#include <math.h>
#include "dasin_defs.h"

#define FMA __builtin_fma

extern "C" double __fsd_asin_fma3(double);
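/*
 * Reinterpret the bits of a double as a 64-bit integer and back, so that
 * sign and magnitude masks can be applied with integer operations.
 */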
33 
static long long __double_as_ll(double const a) {
    return *(long long*)&a;
}

static double __ll_as_double(long long const a) {
    return *(double*)&a;
}


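/*
 * Double-precision asin(a).  For |a| below THRESHOLD_LL the result is a
 * polynomial in a, with the high- and low-order halves of the polynomial
 * evaluated in parallel in the two lanes of an __m128d.  For larger |a|
 * the result is built from sqrt(1 - |a|) and a second two-lane polynomial
 * added to pi/2, with the sign of the input restored at the end.
 */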
double __fsd_asin_fma3(double const a)
{
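    /*
     * Polynomial coefficients, packed two per vector: element 0 carries the
     * high-order half of a polynomial and element 1 the low-order half, so
     * each _mm_fmadd_pd below advances both halves by one Horner step.
     */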
    __m128d const AH0 = _mm_setr_pd(A0_D, H0_D);
    __m128d const BI0 = _mm_setr_pd(B0_D, I0_D);
    __m128d const CJ0 = _mm_setr_pd(C0_D, J0_D);
    __m128d const DK0 = _mm_setr_pd(D0_D, K0_D);
    __m128d const EL0 = _mm_setr_pd(E0_D, L0_D);
    __m128d const FM0 = _mm_setr_pd(F0_D, M0_D);
    __m128d const Ga0 = _mm_setr_pd(G0_D,    a);

    __m128d const AG1 = _mm_setr_pd(A1_D, G1_D);
    __m128d const BH1 = _mm_setr_pd(B1_D, H1_D);
    __m128d const CI1 = _mm_setr_pd(C1_D, I1_D);
    __m128d const DJ1 = _mm_setr_pd(D1_D, J1_D);
    __m128d const EK1 = _mm_setr_pd(E1_D, K1_D);
    __m128d const FL1 = _mm_setr_pd(F1_D, L1_D);

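    /*
     * x is |a| with the sign bit masked off; x2, a3 and x6 are the powers
     * of a fed to the polynomial evaluations below.
     */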
    double res;
    double x = __ll_as_double(ABS_MASK_LL & __double_as_ll(a));
    double x2 = a * a;
    double a3 = x2 * a;
    double x6 = a3 * a3;

    if (__double_as_ll(x) < THRESHOLD_LL)
    {
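        /*
         * Small-magnitude branch: evaluate both halves of the polynomial in
         * x2 with a chain of fused multiply-adds; the last fmadd multiplies
         * the high half by x2 (adding G0_D) and the low half by a3
         * (adding a).
         */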
        __m128d _x2 = _mm_set1_pd(x2);
        __m128d _x2_a3 = _mm_setr_pd(x2, a3);
        __m128d _p0;

        _p0 = _mm_fmadd_pd(AH0, _x2, BI0);
        _p0 = _mm_fmadd_pd(_p0, _x2, CJ0);
        _p0 = _mm_fmadd_pd(_p0, _x2, DK0);
        _p0 = _mm_fmadd_pd(_p0, _x2, EL0);
        _p0 = _mm_fmadd_pd(_p0, _x2, FM0);

        _p0 = _mm_fmadd_pd(_p0, _x2_a3, Ga0);

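        /*
         * Extract the two lanes (the shuffle moves element 1 into element 0)
         * and recombine them: the high half is scaled by a^15 and added to
         * the low half.
         */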
        double p0hi = _mm_cvtsd_f64(_p0);
        _p0 = _mm_shuffle_pd(_p0, _p0, 3);
        double p0lo = _mm_cvtsd_f64(_p0);

        double x12 = x6 * x6;
        double a15 = x12 * a3;
        res = FMA(p0hi, a15, p0lo);
    }
    else
    {
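        /*
         * Large-magnitude branch: the result is sqrt(1 - |a|) times a
         * polynomial in |a|, added to pi/2, with the sign of the input
         * restored at the end.
         */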
        double sq = 1.0 - x;
        /*
         * There seems to be a consensus that setting errno is important
         * for fast-math intrinsics, so call libm's sqrt() here rather than
         * the Intel hardware sqrt instruction, which would not set errno.
         */
        sq = sqrt(sq);

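        /*
         * fix is the sign bit of a, flipped when a > 1.0; XORing it into the
         * final result applies asin(-x) = -asin(x).  For |a| > 1.0 the sqrt
         * above has already produced a NaN.
         */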
        long long fix = (long long)(a > 1.0) << 63;
        long long sign = SGN_MASK_LL & __double_as_ll(a);
        fix = fix ^ sign;

        __m128d _x = _mm_set1_pd(x);
        __m128d _p1;
        _p1 = _mm_fmadd_pd(AG1, _x, BH1);
        _p1 = _mm_fmadd_pd(_p1, _x, CI1);
        _p1 = _mm_fmadd_pd(_p1, _x, DJ1);
        _p1 = _mm_fmadd_pd(_p1, _x, EK1);
        _p1 = _mm_fmadd_pd(_p1, _x, FL1);

        double p1hi = _mm_cvtsd_f64(_p1);
        _p1 = _mm_shuffle_pd(_p1, _p1, 1);
        double p1lo = _mm_cvtsd_f64(_p1);

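        /*
         * Merge the two polynomial halves (high half scaled by x^6), then
         * add sq times the polynomial to pi/2, which is split into
         * PIO2_HI_D and PIO2_LO_D to preserve precision in the final sum.
         */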
        double p1 = FMA(p1hi, x6, p1lo);
        p1 = FMA(sq, p1, PIO2_LO_D);
        p1 = p1 + PIO2_HI_D;

        res = __ll_as_double(fix ^ __double_as_ll(p1));
    }


    return res;
}