1
2 /*
3 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 */
18
19
20 #if defined(TARGET_LINUX_POWER)
21 #error "Source cannot be compiled for POWER architectures"
22 #include "xmm2altivec.h"
23 #else
24 #include <immintrin.h>
25 #endif
26 #include "dacos_defs.h"
27
28 extern "C" __m256d __fvd_acos_fma3_256(__m256d);
29
__fvd_acos_fma3_256(__m256d const a)30 __m256d __fvd_acos_fma3_256(__m256d const a)
31 {
32 __m256i const ABS_MASK = _mm256_set1_epi64x(ABS_MASK_LL);
33 __m256d const ZERO = _mm256_set1_pd(0.0);
34 __m256d const ONE = _mm256_set1_pd(1.0);
35 __m256d const SGN_MASK = (__m256d)_mm256_set1_epi64x(SGN_MASK_LL);
36 __m256d const THRESHOLD = _mm256_set1_pd(THRESHOLD_D);
37 __m256d const PI_HI = _mm256_set1_pd(PI_HI_D);
38
39 __m256d const A0 = _mm256_set1_pd(A0_D);
40 __m256d const B0 = _mm256_set1_pd(B0_D);
41 __m256d const C0 = _mm256_set1_pd(C0_D);
42 __m256d const D0 = _mm256_set1_pd(D0_D);
43 __m256d const E0 = _mm256_set1_pd(E0_D);
44 __m256d const F0 = _mm256_set1_pd(F0_D);
45 __m256d const G0 = _mm256_set1_pd(G0_D);
46 __m256d const H0 = _mm256_set1_pd(H0_D);
47 __m256d const I0 = _mm256_set1_pd(I0_D);
48 __m256d const J0 = _mm256_set1_pd(J0_D);
49 __m256d const K0 = _mm256_set1_pd(K0_D);
50 __m256d const L0 = _mm256_set1_pd(L0_D);
51 __m256d const M0 = _mm256_set1_pd(M0_D);
52 __m256d const N0 = _mm256_set1_pd(N0_D);
53
54 __m256d const A1 = _mm256_set1_pd(A1_D);
55 __m256d const B1 = _mm256_set1_pd(B1_D);
56 __m256d const C1 = _mm256_set1_pd(C1_D);
57 __m256d const D1 = _mm256_set1_pd(D1_D);
58 __m256d const E1 = _mm256_set1_pd(E1_D);
59 __m256d const F1 = _mm256_set1_pd(F1_D);
60 __m256d const G1 = _mm256_set1_pd(G1_D);
61 __m256d const H1 = _mm256_set1_pd(H1_D);
62 __m256d const I1 = _mm256_set1_pd(I1_D);
63 __m256d const J1 = _mm256_set1_pd(J1_D);
64 __m256d const K1 = _mm256_set1_pd(K1_D);
65 __m256d const L1 = _mm256_set1_pd(L1_D);
66 __m256d const M1 = _mm256_set1_pd(M1_D);
67
68 __m256d x, x2, a3, x6, x12, a15, c;
69 __m256d sq, p0, p1;
70 __m256d res, cmp, sign, fix;
71 __m256d p0hi, p0lo, p1hi, p1lo;
72
73 x = _mm256_and_pd(a, (__m256d)ABS_MASK);
74 x2 = _mm256_mul_pd(a, a);
75 sq = _mm256_sub_pd(ONE, x);
76 sq = _mm256_sqrt_pd(sq);
77
78 p1hi = _mm256_fmadd_pd(A1, x, B1);
79
80 p1hi = _mm256_fmadd_pd(p1hi, x, C1);
81 p1lo = _mm256_fmadd_pd(H1, x, I1);
82
83 p1hi = _mm256_fmadd_pd(p1hi, x, D1);
84 p1lo = _mm256_fmadd_pd(p1lo, x, J1);
85 p0hi = _mm256_fmadd_pd(A0, x2, B0);
86 p0lo = _mm256_fmadd_pd(H0, x2, I0);
87
88 p1hi = _mm256_fmadd_pd(p1hi, x, E1);
89 p1lo = _mm256_fmadd_pd(p1lo, x, K1);
90 p0hi = _mm256_fmadd_pd(p0hi, x2, C0);
91 p0lo = _mm256_fmadd_pd(p0lo, x2, J0);
92
93 a3 = _mm256_mul_pd(x2, a);
94 p1hi = _mm256_fmadd_pd(p1hi, x, F1);
95 p1lo = _mm256_fmadd_pd(p1lo, x, L1);
96 p0hi = _mm256_fmadd_pd(p0hi, x2, D0);
97 p0lo = _mm256_fmadd_pd(p0lo, x2, K0);
98
99 p1hi = _mm256_fmadd_pd(p1hi, x, G1);
100 x6 = _mm256_mul_pd(a3, a3);
101 p1lo = _mm256_fmadd_pd(p1lo, x, M1);
102 __m256d pi_mask = _mm256_cmp_pd(ZERO, a, _CMP_GT_OQ);
103 fix = _mm256_cmp_pd(a, ONE, _CMP_GT_OQ);
104 p0hi = _mm256_fmadd_pd(p0hi, x2, E0);
105 p0lo = _mm256_fmadd_pd(p0lo, x2, L0);
106
107 p1 = _mm256_fmadd_pd(p1hi, x6, p1lo);
108 __m256d pi_hi = _mm256_and_pd(pi_mask, PI_HI);
109 fix = _mm256_and_pd(fix, SGN_MASK);
110 sign = _mm256_and_pd(a, SGN_MASK);
111 p0hi = _mm256_fmadd_pd(p0hi, x2, F0);
112 x12 = _mm256_mul_pd(x6, x6);
113 p0lo = _mm256_fmadd_pd(p0lo, x2, M0);
114 c = _mm256_sub_pd(N0, a);
115
116 p1 = _mm256_fmsub_pd(sq, p1, pi_hi);
117 fix = _mm256_xor_pd(fix, sign);
118 p0hi = _mm256_fmadd_pd(p0hi, x2, G0);
119 a15 = _mm256_mul_pd(x12, a3);
120 p0lo = _mm256_fmadd_pd(p0lo, a3, c);
121
122 p1 = _mm256_xor_pd(p1, fix);
123 p0 = _mm256_fmadd_pd(p0hi, a15, p0lo);
124 cmp = _mm256_cmp_pd(x, THRESHOLD, _CMP_LT_OQ);
125
126 res = _mm256_blendv_pd(p1, p0, cmp);
127
128 return res;
129 }
130