1 
2 /*
3  * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  */
18 
19 
20 #if defined(TARGET_LINUX_POWER)
21 #error "Source cannot be compiled for POWER architectures"
22 #include "xmm2altivec.h"
23 #else
24 #include <immintrin.h>
25 #endif
26 #include "dacos_defs.h"
27 
28 extern "C" __m256d __fvd_acos_fma3_256(__m256d);
29 
__fvd_acos_fma3_256(__m256d const a)30 __m256d __fvd_acos_fma3_256(__m256d const a)
31 {
32     __m256i const ABS_MASK  = _mm256_set1_epi64x(ABS_MASK_LL);
33     __m256d const ZERO      = _mm256_set1_pd(0.0);
34     __m256d const ONE       = _mm256_set1_pd(1.0);
35     __m256d const SGN_MASK  = (__m256d)_mm256_set1_epi64x(SGN_MASK_LL);
36     __m256d const THRESHOLD = _mm256_set1_pd(THRESHOLD_D);
37     __m256d const PI_HI     = _mm256_set1_pd(PI_HI_D);
38 
39     __m256d const A0 = _mm256_set1_pd(A0_D);
40     __m256d const B0 = _mm256_set1_pd(B0_D);
41     __m256d const C0 = _mm256_set1_pd(C0_D);
42     __m256d const D0 = _mm256_set1_pd(D0_D);
43     __m256d const E0 = _mm256_set1_pd(E0_D);
44     __m256d const F0 = _mm256_set1_pd(F0_D);
45     __m256d const G0 = _mm256_set1_pd(G0_D);
46     __m256d const H0 = _mm256_set1_pd(H0_D);
47     __m256d const I0 = _mm256_set1_pd(I0_D);
48     __m256d const J0 = _mm256_set1_pd(J0_D);
49     __m256d const K0 = _mm256_set1_pd(K0_D);
50     __m256d const L0 = _mm256_set1_pd(L0_D);
51     __m256d const M0 = _mm256_set1_pd(M0_D);
52     __m256d const N0 = _mm256_set1_pd(N0_D);
53 
54     __m256d const A1 = _mm256_set1_pd(A1_D);
55     __m256d const B1 = _mm256_set1_pd(B1_D);
56     __m256d const C1 = _mm256_set1_pd(C1_D);
57     __m256d const D1 = _mm256_set1_pd(D1_D);
58     __m256d const E1 = _mm256_set1_pd(E1_D);
59     __m256d const F1 = _mm256_set1_pd(F1_D);
60     __m256d const G1 = _mm256_set1_pd(G1_D);
61     __m256d const H1 = _mm256_set1_pd(H1_D);
62     __m256d const I1 = _mm256_set1_pd(I1_D);
63     __m256d const J1 = _mm256_set1_pd(J1_D);
64     __m256d const K1 = _mm256_set1_pd(K1_D);
65     __m256d const L1 = _mm256_set1_pd(L1_D);
66     __m256d const M1 = _mm256_set1_pd(M1_D);
67 
68     __m256d x, x2, a3, x6, x12, a15, c;
69     __m256d sq, p0, p1;
70     __m256d res, cmp, sign, fix;
71     __m256d p0hi, p0lo, p1hi, p1lo;
72 
73     x  = _mm256_and_pd(a, (__m256d)ABS_MASK);
74     x2 = _mm256_mul_pd(a, a);
75     sq = _mm256_sub_pd(ONE, x);
76     sq = _mm256_sqrt_pd(sq);
77 
78     p1hi = _mm256_fmadd_pd(A1, x, B1);
79 
80     p1hi = _mm256_fmadd_pd(p1hi, x, C1);
81     p1lo = _mm256_fmadd_pd(H1, x, I1);
82 
83     p1hi = _mm256_fmadd_pd(p1hi, x, D1);
84     p1lo = _mm256_fmadd_pd(p1lo, x, J1);
85     p0hi = _mm256_fmadd_pd(A0, x2, B0);
86     p0lo = _mm256_fmadd_pd(H0, x2, I0);
87 
88     p1hi = _mm256_fmadd_pd(p1hi, x, E1);
89     p1lo = _mm256_fmadd_pd(p1lo, x, K1);
90     p0hi = _mm256_fmadd_pd(p0hi, x2, C0);
91     p0lo = _mm256_fmadd_pd(p0lo, x2, J0);
92 
93     a3 = _mm256_mul_pd(x2, a);
94     p1hi = _mm256_fmadd_pd(p1hi, x, F1);
95     p1lo = _mm256_fmadd_pd(p1lo, x, L1);
96     p0hi = _mm256_fmadd_pd(p0hi, x2, D0);
97     p0lo = _mm256_fmadd_pd(p0lo, x2, K0);
98 
99     p1hi = _mm256_fmadd_pd(p1hi, x, G1);
100     x6 = _mm256_mul_pd(a3, a3);
101     p1lo = _mm256_fmadd_pd(p1lo, x, M1);
102     __m256d pi_mask = _mm256_cmp_pd(ZERO, a, _CMP_GT_OQ);
103     fix = _mm256_cmp_pd(a, ONE, _CMP_GT_OQ);
104     p0hi = _mm256_fmadd_pd(p0hi, x2, E0);
105     p0lo = _mm256_fmadd_pd(p0lo, x2, L0);
106 
107     p1 = _mm256_fmadd_pd(p1hi, x6, p1lo);
108     __m256d pi_hi = _mm256_and_pd(pi_mask, PI_HI);
109     fix = _mm256_and_pd(fix, SGN_MASK);
110     sign = _mm256_and_pd(a, SGN_MASK);
111     p0hi = _mm256_fmadd_pd(p0hi, x2, F0);
112     x12 = _mm256_mul_pd(x6, x6);
113     p0lo = _mm256_fmadd_pd(p0lo, x2, M0);
114     c = _mm256_sub_pd(N0, a);
115 
116     p1 = _mm256_fmsub_pd(sq, p1, pi_hi);
117     fix = _mm256_xor_pd(fix, sign);
118     p0hi = _mm256_fmadd_pd(p0hi, x2, G0);
119     a15 = _mm256_mul_pd(x12, a3);
120     p0lo = _mm256_fmadd_pd(p0lo, a3, c);
121 
122     p1 = _mm256_xor_pd(p1, fix);
123     p0 = _mm256_fmadd_pd(p0hi, a15, p0lo);
124     cmp = _mm256_cmp_pd(x, THRESHOLD, _CMP_LT_OQ);
125 
126     res = _mm256_blendv_pd(p1, p0, cmp);
127 
128     return res;
129 }
130