1 /*
2  * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *
16  */
17 
18 
19 #if defined(TARGET_LINUX_POWER)
20 #error "Source cannot be compiled for POWER architectures"
21 #include "xmm2altivec.h"
22 #else
23 #include <immintrin.h>
24 #include "mth_avx512helper.h"
25 #endif
26 #include "dacos_defs.h"
27 
28 extern "C" __m512d FCN_AVX512(__fvd_acos_fma3)(__m512d);
29 
FCN_AVX512(__fvd_acos_fma3)30 __m512d FCN_AVX512(__fvd_acos_fma3)(__m512d const a)
31 {
32     __m512i const ABS_MASK  = _mm512_set1_epi64(ABS_MASK_LL);
33     __m512d const ZERO      = _mm512_set1_pd(0.0);
34     __m512d const ONE       = _mm512_set1_pd(1.0);
35     __m512d const SGN_MASK  = (__m512d)_mm512_set1_epi64(SGN_MASK_LL);
36     __m512d const THRESHOLD = _mm512_set1_pd(THRESHOLD_D);
37     __m512d const PI_HI     = _mm512_set1_pd(PI_HI_D);
38 
39     __m512d const A0 = _mm512_set1_pd(A0_D);
40     __m512d const B0 = _mm512_set1_pd(B0_D);
41     __m512d const C0 = _mm512_set1_pd(C0_D);
42     __m512d const D0 = _mm512_set1_pd(D0_D);
43     __m512d const E0 = _mm512_set1_pd(E0_D);
44     __m512d const F0 = _mm512_set1_pd(F0_D);
45     __m512d const G0 = _mm512_set1_pd(G0_D);
46     __m512d const H0 = _mm512_set1_pd(H0_D);
47     __m512d const I0 = _mm512_set1_pd(I0_D);
48     __m512d const J0 = _mm512_set1_pd(J0_D);
49     __m512d const K0 = _mm512_set1_pd(K0_D);
50     __m512d const L0 = _mm512_set1_pd(L0_D);
51     __m512d const M0 = _mm512_set1_pd(M0_D);
52     __m512d const N0 = _mm512_set1_pd(N0_D);
53 
54     __m512d const A1 = _mm512_set1_pd(A1_D);
55     __m512d const B1 = _mm512_set1_pd(B1_D);
56     __m512d const C1 = _mm512_set1_pd(C1_D);
57     __m512d const D1 = _mm512_set1_pd(D1_D);
58     __m512d const E1 = _mm512_set1_pd(E1_D);
59     __m512d const F1 = _mm512_set1_pd(F1_D);
60     __m512d const G1 = _mm512_set1_pd(G1_D);
61     __m512d const H1 = _mm512_set1_pd(H1_D);
62     __m512d const I1 = _mm512_set1_pd(I1_D);
63     __m512d const J1 = _mm512_set1_pd(J1_D);
64     __m512d const K1 = _mm512_set1_pd(K1_D);
65     __m512d const L1 = _mm512_set1_pd(L1_D);
66     __m512d const M1 = _mm512_set1_pd(M1_D);
67 
68     __m512d x, x2, a3, x6, x12, a15, c;
69     __m512d sq, p0, p1;
70     __m512d res, cmp, sign, fix;
71     __m512d p0hi, p0lo, p1hi, p1lo;
72 
73     x  = _MM512_AND_PD(a, (__m512d)ABS_MASK);
74     x2 = _mm512_mul_pd(a, a);
75     sq = _mm512_sub_pd(ONE, x);
76     sq = _mm512_sqrt_pd(sq);
77 
78     p1hi = _mm512_fmadd_pd(A1, x, B1);
79 
80     p1hi = _mm512_fmadd_pd(p1hi, x, C1);
81     p1lo = _mm512_fmadd_pd(H1, x, I1);
82 
83     p1hi = _mm512_fmadd_pd(p1hi, x, D1);
84     p1lo = _mm512_fmadd_pd(p1lo, x, J1);
85     p0hi = _mm512_fmadd_pd(A0, x2, B0);
86     p0lo = _mm512_fmadd_pd(H0, x2, I0);
87 
88     p1hi = _mm512_fmadd_pd(p1hi, x, E1);
89     p1lo = _mm512_fmadd_pd(p1lo, x, K1);
90     p0hi = _mm512_fmadd_pd(p0hi, x2, C0);
91     p0lo = _mm512_fmadd_pd(p0lo, x2, J0);
92 
93     a3 = _mm512_mul_pd(x2, a);
94     p1hi = _mm512_fmadd_pd(p1hi, x, F1);
95     p1lo = _mm512_fmadd_pd(p1lo, x, L1);
96     p0hi = _mm512_fmadd_pd(p0hi, x2, D0);
97     p0lo = _mm512_fmadd_pd(p0lo, x2, K0);
98 
99     p1hi = _mm512_fmadd_pd(p1hi, x, G1);
100     x6 = _mm512_mul_pd(a3, a3);
101     p1lo = _mm512_fmadd_pd(p1lo, x, M1);
102     __m512d pi_mask = _MM512_CMP_PD(ZERO, a, _CMP_GT_OQ);
103     fix = _MM512_CMP_PD(a, ONE, _CMP_GT_OQ);
104     p0hi = _mm512_fmadd_pd(p0hi, x2, E0);
105     p0lo = _mm512_fmadd_pd(p0lo, x2, L0);
106 
107     p1 = _mm512_fmadd_pd(p1hi, x6, p1lo);
108     __m512d pi_hi = _MM512_AND_PD(pi_mask, PI_HI);
109     fix = _MM512_AND_PD(fix, SGN_MASK);
110     sign = _MM512_AND_PD(a, SGN_MASK);
111     p0hi = _mm512_fmadd_pd(p0hi, x2, F0);
112     x12 = _mm512_mul_pd(x6, x6);
113     p0lo = _mm512_fmadd_pd(p0lo, x2, M0);
114     c = _mm512_sub_pd(N0, a);
115 
116     p1 = _mm512_fmsub_pd(sq, p1, pi_hi);
117     fix = _MM512_XOR_PD(fix, sign);
118     p0hi = _mm512_fmadd_pd(p0hi, x2, G0);
119     a15 = _mm512_mul_pd(x12, a3);
120     p0lo = _mm512_fmadd_pd(p0lo, a3, c);
121 
122     p1 = _MM512_XOR_PD(p1, fix);
123     p0 = _mm512_fmadd_pd(p0hi, a15, p0lo);
124     cmp = _MM512_CMP_PD(x, THRESHOLD, _CMP_LT_OQ);
125 
126     res = _MM512_BLENDV_PD(p1, p0, cmp);
127 
128     return res;
129 }
130