1 
2 /*
3  * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  */
18 
19 
20 #if defined(TARGET_LINUX_POWER)
21 #include "xmm2altivec.h"
22 #elif defined(TARGET_LINUX_ARM64)
23 #include "arm64intrin.h"
24 #else
25 #include <immintrin.h>
26 #endif
27 #include "dacos_defs.h"
28 
29 extern "C" __m128d __fvd_acos_fma3(__m128d);
30 
__fvd_acos_fma3(__m128d const a)31 __m128d __fvd_acos_fma3(__m128d const a)
32 {
33     __m128i const ABS_MASK  = _mm_set1_epi64x(ABS_MASK_LL);
34     __m128d const ZERO      = _mm_set1_pd(0.0);
35     __m128d const ONE       = _mm_set1_pd(1.0);
36     __m128d const SGN_MASK  = (__m128d)_mm_set1_epi64x(SGN_MASK_LL);
37     __m128d const THRESHOLD = _mm_set1_pd(THRESHOLD_D);
38     __m128d const PI_HI     = _mm_set1_pd(PI_HI_D);
39 
40     __m128d const A0 = _mm_set1_pd(A0_D);
41     __m128d const B0 = _mm_set1_pd(B0_D);
42     __m128d const C0 = _mm_set1_pd(C0_D);
43     __m128d const D0 = _mm_set1_pd(D0_D);
44     __m128d const E0 = _mm_set1_pd(E0_D);
45     __m128d const F0 = _mm_set1_pd(F0_D);
46     __m128d const G0 = _mm_set1_pd(G0_D);
47     __m128d const H0 = _mm_set1_pd(H0_D);
48     __m128d const I0 = _mm_set1_pd(I0_D);
49     __m128d const J0 = _mm_set1_pd(J0_D);
50     __m128d const K0 = _mm_set1_pd(K0_D);
51     __m128d const L0 = _mm_set1_pd(L0_D);
52     __m128d const M0 = _mm_set1_pd(M0_D);
53     __m128d const N0 = _mm_set1_pd(N0_D);
54 
55     __m128d const A1 = _mm_set1_pd(A1_D);
56     __m128d const B1 = _mm_set1_pd(B1_D);
57     __m128d const C1 = _mm_set1_pd(C1_D);
58     __m128d const D1 = _mm_set1_pd(D1_D);
59     __m128d const E1 = _mm_set1_pd(E1_D);
60     __m128d const F1 = _mm_set1_pd(F1_D);
61     __m128d const G1 = _mm_set1_pd(G1_D);
62     __m128d const H1 = _mm_set1_pd(H1_D);
63     __m128d const I1 = _mm_set1_pd(I1_D);
64     __m128d const J1 = _mm_set1_pd(J1_D);
65     __m128d const K1 = _mm_set1_pd(K1_D);
66     __m128d const L1 = _mm_set1_pd(L1_D);
67     __m128d const M1 = _mm_set1_pd(M1_D);
68 
69     __m128d x, x2, a3, x6, x12, a15, c;
70     __m128d sq, p0, p1;
71     __m128d res, cmp, sign, fix;
72     __m128d p0hi, p0lo, p1hi, p1lo;
73 
74     x  = _mm_and_pd(a, (__m128d)ABS_MASK);
75     x2 = _mm_mul_pd(a, a);
76     sq = _mm_sub_pd(ONE, x);
77     sq = _mm_sqrt_pd(sq);
78 
79     p1hi = _mm_fmadd_pd(A1, x, B1);
80 
81     p1hi = _mm_fmadd_pd(p1hi, x, C1);
82     p1lo = _mm_fmadd_pd(H1, x, I1);
83 
84     p1hi = _mm_fmadd_pd(p1hi, x, D1);
85     p1lo = _mm_fmadd_pd(p1lo, x, J1);
86     p0hi = _mm_fmadd_pd(A0, x2, B0);
87     p0lo = _mm_fmadd_pd(H0, x2, I0);
88 
89     p1hi = _mm_fmadd_pd(p1hi, x, E1);
90     p1lo = _mm_fmadd_pd(p1lo, x, K1);
91     p0hi = _mm_fmadd_pd(p0hi, x2, C0);
92     p0lo = _mm_fmadd_pd(p0lo, x2, J0);
93 
94     a3 = _mm_mul_pd(x2, a);
95     p1hi = _mm_fmadd_pd(p1hi, x, F1);
96     p1lo = _mm_fmadd_pd(p1lo, x, L1);
97     p0hi = _mm_fmadd_pd(p0hi, x2, D0);
98     p0lo = _mm_fmadd_pd(p0lo, x2, K0);
99 
100     p1hi = _mm_fmadd_pd(p1hi, x, G1);
101     x6 = _mm_mul_pd(a3, a3);
102     p1lo = _mm_fmadd_pd(p1lo, x, M1);
103     __m128d pi_mask = _mm_cmp_pd(ZERO, a, _CMP_GT_OQ);
104     fix = _mm_cmp_pd(a, ONE, _CMP_GT_OQ);
105     p0hi = _mm_fmadd_pd(p0hi, x2, E0);
106     p0lo = _mm_fmadd_pd(p0lo, x2, L0);
107 
108     p1 = _mm_fmadd_pd(p1hi, x6, p1lo);
109     __m128d pi_hi = _mm_and_pd(pi_mask, PI_HI);
110     fix = _mm_and_pd(fix, SGN_MASK);
111     sign = _mm_and_pd(a, SGN_MASK);
112     p0hi = _mm_fmadd_pd(p0hi, x2, F0);
113     x12 = _mm_mul_pd(x6, x6);
114     p0lo = _mm_fmadd_pd(p0lo, x2, M0);
115     c = _mm_sub_pd(N0, a);
116 
117     p1 = _mm_fmsub_pd(sq, p1, pi_hi);
118     fix = _mm_xor_pd(fix, sign);
119     p0hi = _mm_fmadd_pd(p0hi, x2, G0);
120     a15 = _mm_mul_pd(x12, a3);
121     p0lo = _mm_fmadd_pd(p0lo, a3, c);
122 
123     p1 = _mm_xor_pd(p1, fix);
124     p0 = _mm_fmadd_pd(p0hi, a15, p0lo);
125     cmp = _mm_cmp_pd(x, THRESHOLD, _CMP_LT_OQ);
126 
127     res = _mm_blendv_pd(p1, p0, cmp);
128 
129     return res;
130 }
131 
132