/*
 * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

#if defined(TARGET_LINUX_POWER)
#include "xmm2altivec.h"
#elif defined(TARGET_LINUX_ARM64)
#include "arm64intrin.h"
#else
#include <immintrin.h>
#endif
#include <math.h>
#include <string.h> /* memcpy, used by the bit-pattern helpers below */
#include "dasin_defs.h"

/* __builtin_fma evaluates a*b + c with a single rounding (a fused
 * multiply-add instruction where the target supports one). */
#define FMA __builtin_fma

extern "C" double __fsd_asin_fma3(double);

/* Reinterpret the bits of a double as a 64-bit integer; memcpy avoids
 * the strict-aliasing violation of a pointer cast. */
static long long __double_as_ll(double const a) {
    long long t;
    memcpy(&t, &a, sizeof t);
    return t;
}

/* Reinterpret 64-bit integer bits as a double. */
static double __ll_as_double(long long const a) {
    double t;
    memcpy(&t, &a, sizeof t);
    return t;
}
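
/*
 * Double-precision asin(a).  Both code paths below evaluate the two halves
 * of a polynomial approximation in parallel, one half per __m128d lane, and
 * recombine the lanes with a scalar FMA at the end.  The coefficients
 * (A0_D..M0_D, A1_D..L1_D) and the constants ABS_MASK_LL, SGN_MASK_LL,
 * THRESHOLD_LL, PIO2_HI_D and PIO2_LO_D are expected to be supplied by
 * dasin_defs.h.
 */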

double __fsd_asin_fma3(double const a)
{
    /* Coefficients for the small-|a| polynomial, packed two per vector:
     * lane 0 holds one half of the expansion (A0..G0), lane 1 the other
     * (H0..M0), with the input a folded into the last pair. */
    __m128d const AH0 = _mm_setr_pd(A0_D, H0_D);
    __m128d const BI0 = _mm_setr_pd(B0_D, I0_D);
    __m128d const CJ0 = _mm_setr_pd(C0_D, J0_D);
    __m128d const DK0 = _mm_setr_pd(D0_D, K0_D);
    __m128d const EL0 = _mm_setr_pd(E0_D, L0_D);
    __m128d const FM0 = _mm_setr_pd(F0_D, M0_D);
    __m128d const Ga0 = _mm_setr_pd(G0_D, a);

    /* Coefficients for the sqrt-based path, packed the same way. */
    __m128d const AG1 = _mm_setr_pd(A1_D, G1_D);
    __m128d const BH1 = _mm_setr_pd(B1_D, H1_D);
    __m128d const CI1 = _mm_setr_pd(C1_D, I1_D);
    __m128d const DJ1 = _mm_setr_pd(D1_D, J1_D);
    __m128d const EK1 = _mm_setr_pd(E1_D, K1_D);
    __m128d const FL1 = _mm_setr_pd(F1_D, L1_D);

    double res;
    double x  = __ll_as_double(ABS_MASK_LL & __double_as_ll(a)); /* x = |a| */
    double x2 = a * a;
    double a3 = x2 * a;
    double x6 = a3 * a3;
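
    /*
     * Two evaluation paths, selected by comparing the bit pattern of |a|
     * (nonnegative doubles compare correctly as integers) against
     * THRESHOLD_LL: small |a| takes a direct polynomial in a; |a| near 1
     * goes through pi/2 and sqrt(1 - |a|).
     */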
    if (__double_as_ll(x) < THRESHOLD_LL)
    {
        __m128d _x2    = _mm_set1_pd(x2);
        __m128d _x2_a3 = _mm_setr_pd(x2, a3);
        __m128d _p0;

        /* Each FMA advances both packed half-polynomials by one Horner
         * step in x2. */
        _p0 = _mm_fmadd_pd(AH0, _x2, BI0);
        _p0 = _mm_fmadd_pd(_p0, _x2, CJ0);
        _p0 = _mm_fmadd_pd(_p0, _x2, DK0);
        _p0 = _mm_fmadd_pd(_p0, _x2, EL0);
        _p0 = _mm_fmadd_pd(_p0, _x2, FM0);

        /* Final step: lane 0 gets p0*x2 + G0, lane 1 gets p0*a3 + a. */
        _p0 = _mm_fmadd_pd(_p0, _x2_a3, Ga0);

        /* Extract the two lanes. */
        double p0hi = _mm_cvtsd_f64(_p0);
        _p0 = _mm_shuffle_pd(_p0, _p0, 3);
        double p0lo = _mm_cvtsd_f64(_p0);

        /* Recombine: res = p0hi * a^15 + p0lo. */
        double x12 = x6 * x6;
        double a15 = x12 * a3;
        res = FMA(p0hi, a15, p0lo);
    }
    else
    {
        double sq = 1.0 - x;
        /*
         * There seems to be a consensus that setting errno is important
         * for fast-math intrinsics, so call the libm sqrt() here rather
         * than the Intel hardware sqrt instruction; for |a| > 1.0 the
         * argument is negative, and libm sqrt() returns NaN and can set
         * errno to EDOM.
         */
        sq = sqrt(sq);

        /*
         * Sign handling on the bit pattern: XOR-ing the sign of a back in
         * restores the odd symmetry asin(-a) = -asin(a).  The (a > 1.0)
         * term flips the sign bit once more, which is harmless since p1
         * is already NaN whenever |a| > 1.
         */
        long long fix  = (long long)(a > 1.0) << 63;
        long long sign = SGN_MASK_LL & __double_as_ll(a);
        fix = fix ^ sign;

        __m128d _x = _mm_set1_pd(x);
        __m128d _p1;
        _p1 = _mm_fmadd_pd(AG1, _x, BH1);
        _p1 = _mm_fmadd_pd(_p1, _x, CI1);
        _p1 = _mm_fmadd_pd(_p1, _x, DJ1);
        _p1 = _mm_fmadd_pd(_p1, _x, EK1);
        _p1 = _mm_fmadd_pd(_p1, _x, FL1);

        /* Extract the lanes and recombine: p1 = p1hi * x^6 + p1lo. */
        double p1hi = _mm_cvtsd_f64(_p1);
        _p1 = _mm_shuffle_pd(_p1, _p1, 1);
        double p1lo = _mm_cvtsd_f64(_p1);

        double p1 = FMA(p1hi, x6, p1lo);
        /* asin(|a|) = PIO2_HI_D + PIO2_LO_D + sqrt(1 - |a|) * p1, with
         * pi/2 split into high and low parts for accuracy. */
        p1 = FMA(sq, p1, PIO2_LO_D);
        p1 = p1 + PIO2_HI_D;

        res = __ll_as_double(fix ^ __double_as_ll(p1));
    }

    return res;
}
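
/*
 * A minimal self-test sketch (not part of the original source).  The
 * ASIN_SELFTEST guard macro is an assumption introduced here for
 * illustration: compile with -DASIN_SELFTEST and link against libm to
 * compare this implementation with the reference asin() at a few points,
 * including out-of-domain inputs that should produce NaN.
 */
#ifdef ASIN_SELFTEST
#include <stdio.h>

int main(void)
{
    double inputs[] = { -1.0, -0.75, -0.5, 0.0, 0.3, 0.99, 1.0, 2.0 };
    for (unsigned i = 0; i < sizeof inputs / sizeof inputs[0]; ++i) {
        double a = inputs[i];
        printf("a=% .4f  fma3=% .17e  libm=% .17e\n",
               a, __fsd_asin_fma3(a), asin(a));
    }
    return 0;
}
#endif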