1 /*
2 * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 */
17
18
19 #if defined(TARGET_LINUX_POWER)
20 #error "Source cannot be compiled for POWER architectures"
21 #include "xmm2altivec.h"
22 #else
23 #include <immintrin.h>
24 #include "mth_avx512helper.h"
25 #endif
26 #include "dacos_defs.h"
27
28 extern "C" __m512d FCN_AVX512(__fvd_acos_fma3)(__m512d);
29
FCN_AVX512(__fvd_acos_fma3)30 __m512d FCN_AVX512(__fvd_acos_fma3)(__m512d const a)
31 {
32 __m512i const ABS_MASK = _mm512_set1_epi64(ABS_MASK_LL);
33 __m512d const ZERO = _mm512_set1_pd(0.0);
34 __m512d const ONE = _mm512_set1_pd(1.0);
35 __m512d const SGN_MASK = (__m512d)_mm512_set1_epi64(SGN_MASK_LL);
36 __m512d const THRESHOLD = _mm512_set1_pd(THRESHOLD_D);
37 __m512d const PI_HI = _mm512_set1_pd(PI_HI_D);
38
39 __m512d const A0 = _mm512_set1_pd(A0_D);
40 __m512d const B0 = _mm512_set1_pd(B0_D);
41 __m512d const C0 = _mm512_set1_pd(C0_D);
42 __m512d const D0 = _mm512_set1_pd(D0_D);
43 __m512d const E0 = _mm512_set1_pd(E0_D);
44 __m512d const F0 = _mm512_set1_pd(F0_D);
45 __m512d const G0 = _mm512_set1_pd(G0_D);
46 __m512d const H0 = _mm512_set1_pd(H0_D);
47 __m512d const I0 = _mm512_set1_pd(I0_D);
48 __m512d const J0 = _mm512_set1_pd(J0_D);
49 __m512d const K0 = _mm512_set1_pd(K0_D);
50 __m512d const L0 = _mm512_set1_pd(L0_D);
51 __m512d const M0 = _mm512_set1_pd(M0_D);
52 __m512d const N0 = _mm512_set1_pd(N0_D);
53
54 __m512d const A1 = _mm512_set1_pd(A1_D);
55 __m512d const B1 = _mm512_set1_pd(B1_D);
56 __m512d const C1 = _mm512_set1_pd(C1_D);
57 __m512d const D1 = _mm512_set1_pd(D1_D);
58 __m512d const E1 = _mm512_set1_pd(E1_D);
59 __m512d const F1 = _mm512_set1_pd(F1_D);
60 __m512d const G1 = _mm512_set1_pd(G1_D);
61 __m512d const H1 = _mm512_set1_pd(H1_D);
62 __m512d const I1 = _mm512_set1_pd(I1_D);
63 __m512d const J1 = _mm512_set1_pd(J1_D);
64 __m512d const K1 = _mm512_set1_pd(K1_D);
65 __m512d const L1 = _mm512_set1_pd(L1_D);
66 __m512d const M1 = _mm512_set1_pd(M1_D);
67
68 __m512d x, x2, a3, x6, x12, a15, c;
69 __m512d sq, p0, p1;
70 __m512d res, cmp, sign, fix;
71 __m512d p0hi, p0lo, p1hi, p1lo;
72
73 x = _MM512_AND_PD(a, (__m512d)ABS_MASK);
74 x2 = _mm512_mul_pd(a, a);
75 sq = _mm512_sub_pd(ONE, x);
76 sq = _mm512_sqrt_pd(sq);
77
78 p1hi = _mm512_fmadd_pd(A1, x, B1);
79
80 p1hi = _mm512_fmadd_pd(p1hi, x, C1);
81 p1lo = _mm512_fmadd_pd(H1, x, I1);
82
83 p1hi = _mm512_fmadd_pd(p1hi, x, D1);
84 p1lo = _mm512_fmadd_pd(p1lo, x, J1);
85 p0hi = _mm512_fmadd_pd(A0, x2, B0);
86 p0lo = _mm512_fmadd_pd(H0, x2, I0);
87
88 p1hi = _mm512_fmadd_pd(p1hi, x, E1);
89 p1lo = _mm512_fmadd_pd(p1lo, x, K1);
90 p0hi = _mm512_fmadd_pd(p0hi, x2, C0);
91 p0lo = _mm512_fmadd_pd(p0lo, x2, J0);
92
93 a3 = _mm512_mul_pd(x2, a);
94 p1hi = _mm512_fmadd_pd(p1hi, x, F1);
95 p1lo = _mm512_fmadd_pd(p1lo, x, L1);
96 p0hi = _mm512_fmadd_pd(p0hi, x2, D0);
97 p0lo = _mm512_fmadd_pd(p0lo, x2, K0);
98
99 p1hi = _mm512_fmadd_pd(p1hi, x, G1);
100 x6 = _mm512_mul_pd(a3, a3);
101 p1lo = _mm512_fmadd_pd(p1lo, x, M1);
102 __m512d pi_mask = _MM512_CMP_PD(ZERO, a, _CMP_GT_OQ);
103 fix = _MM512_CMP_PD(a, ONE, _CMP_GT_OQ);
104 p0hi = _mm512_fmadd_pd(p0hi, x2, E0);
105 p0lo = _mm512_fmadd_pd(p0lo, x2, L0);
106
107 p1 = _mm512_fmadd_pd(p1hi, x6, p1lo);
108 __m512d pi_hi = _MM512_AND_PD(pi_mask, PI_HI);
109 fix = _MM512_AND_PD(fix, SGN_MASK);
110 sign = _MM512_AND_PD(a, SGN_MASK);
111 p0hi = _mm512_fmadd_pd(p0hi, x2, F0);
112 x12 = _mm512_mul_pd(x6, x6);
113 p0lo = _mm512_fmadd_pd(p0lo, x2, M0);
114 c = _mm512_sub_pd(N0, a);
115
116 p1 = _mm512_fmsub_pd(sq, p1, pi_hi);
117 fix = _MM512_XOR_PD(fix, sign);
118 p0hi = _mm512_fmadd_pd(p0hi, x2, G0);
119 a15 = _mm512_mul_pd(x12, a3);
120 p0lo = _mm512_fmadd_pd(p0lo, a3, c);
121
122 p1 = _MM512_XOR_PD(p1, fix);
123 p0 = _mm512_fmadd_pd(p0hi, a15, p0lo);
124 cmp = _MM512_CMP_PD(x, THRESHOLD, _CMP_LT_OQ);
125
126 res = _MM512_BLENDV_PD(p1, p0, cmp);
127
128 return res;
129 }
130