1
2 /*
3 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 */
18
19
20 #if defined(TARGET_LINUX_POWER)
21 #include "xmm2altivec.h"
22 #elif defined(TARGET_LINUX_ARM64)
23 #include "arm64intrin.h"
24 #else
25 #include <immintrin.h>
26 #endif
27 #include "dacos_defs.h"
28
29 extern "C" __m128d __fvd_acos_fma3(__m128d);
30
__fvd_acos_fma3(__m128d const a)31 __m128d __fvd_acos_fma3(__m128d const a)
32 {
33 __m128i const ABS_MASK = _mm_set1_epi64x(ABS_MASK_LL);
34 __m128d const ZERO = _mm_set1_pd(0.0);
35 __m128d const ONE = _mm_set1_pd(1.0);
36 __m128d const SGN_MASK = (__m128d)_mm_set1_epi64x(SGN_MASK_LL);
37 __m128d const THRESHOLD = _mm_set1_pd(THRESHOLD_D);
38 __m128d const PI_HI = _mm_set1_pd(PI_HI_D);
39
40 __m128d const A0 = _mm_set1_pd(A0_D);
41 __m128d const B0 = _mm_set1_pd(B0_D);
42 __m128d const C0 = _mm_set1_pd(C0_D);
43 __m128d const D0 = _mm_set1_pd(D0_D);
44 __m128d const E0 = _mm_set1_pd(E0_D);
45 __m128d const F0 = _mm_set1_pd(F0_D);
46 __m128d const G0 = _mm_set1_pd(G0_D);
47 __m128d const H0 = _mm_set1_pd(H0_D);
48 __m128d const I0 = _mm_set1_pd(I0_D);
49 __m128d const J0 = _mm_set1_pd(J0_D);
50 __m128d const K0 = _mm_set1_pd(K0_D);
51 __m128d const L0 = _mm_set1_pd(L0_D);
52 __m128d const M0 = _mm_set1_pd(M0_D);
53 __m128d const N0 = _mm_set1_pd(N0_D);
54
55 __m128d const A1 = _mm_set1_pd(A1_D);
56 __m128d const B1 = _mm_set1_pd(B1_D);
57 __m128d const C1 = _mm_set1_pd(C1_D);
58 __m128d const D1 = _mm_set1_pd(D1_D);
59 __m128d const E1 = _mm_set1_pd(E1_D);
60 __m128d const F1 = _mm_set1_pd(F1_D);
61 __m128d const G1 = _mm_set1_pd(G1_D);
62 __m128d const H1 = _mm_set1_pd(H1_D);
63 __m128d const I1 = _mm_set1_pd(I1_D);
64 __m128d const J1 = _mm_set1_pd(J1_D);
65 __m128d const K1 = _mm_set1_pd(K1_D);
66 __m128d const L1 = _mm_set1_pd(L1_D);
67 __m128d const M1 = _mm_set1_pd(M1_D);
68
69 __m128d x, x2, a3, x6, x12, a15, c;
70 __m128d sq, p0, p1;
71 __m128d res, cmp, sign, fix;
72 __m128d p0hi, p0lo, p1hi, p1lo;
73
74 x = _mm_and_pd(a, (__m128d)ABS_MASK);
75 x2 = _mm_mul_pd(a, a);
76 sq = _mm_sub_pd(ONE, x);
77 sq = _mm_sqrt_pd(sq);
78
79 p1hi = _mm_fmadd_pd(A1, x, B1);
80
81 p1hi = _mm_fmadd_pd(p1hi, x, C1);
82 p1lo = _mm_fmadd_pd(H1, x, I1);
83
84 p1hi = _mm_fmadd_pd(p1hi, x, D1);
85 p1lo = _mm_fmadd_pd(p1lo, x, J1);
86 p0hi = _mm_fmadd_pd(A0, x2, B0);
87 p0lo = _mm_fmadd_pd(H0, x2, I0);
88
89 p1hi = _mm_fmadd_pd(p1hi, x, E1);
90 p1lo = _mm_fmadd_pd(p1lo, x, K1);
91 p0hi = _mm_fmadd_pd(p0hi, x2, C0);
92 p0lo = _mm_fmadd_pd(p0lo, x2, J0);
93
94 a3 = _mm_mul_pd(x2, a);
95 p1hi = _mm_fmadd_pd(p1hi, x, F1);
96 p1lo = _mm_fmadd_pd(p1lo, x, L1);
97 p0hi = _mm_fmadd_pd(p0hi, x2, D0);
98 p0lo = _mm_fmadd_pd(p0lo, x2, K0);
99
100 p1hi = _mm_fmadd_pd(p1hi, x, G1);
101 x6 = _mm_mul_pd(a3, a3);
102 p1lo = _mm_fmadd_pd(p1lo, x, M1);
103 __m128d pi_mask = _mm_cmp_pd(ZERO, a, _CMP_GT_OQ);
104 fix = _mm_cmp_pd(a, ONE, _CMP_GT_OQ);
105 p0hi = _mm_fmadd_pd(p0hi, x2, E0);
106 p0lo = _mm_fmadd_pd(p0lo, x2, L0);
107
108 p1 = _mm_fmadd_pd(p1hi, x6, p1lo);
109 __m128d pi_hi = _mm_and_pd(pi_mask, PI_HI);
110 fix = _mm_and_pd(fix, SGN_MASK);
111 sign = _mm_and_pd(a, SGN_MASK);
112 p0hi = _mm_fmadd_pd(p0hi, x2, F0);
113 x12 = _mm_mul_pd(x6, x6);
114 p0lo = _mm_fmadd_pd(p0lo, x2, M0);
115 c = _mm_sub_pd(N0, a);
116
117 p1 = _mm_fmsub_pd(sq, p1, pi_hi);
118 fix = _mm_xor_pd(fix, sign);
119 p0hi = _mm_fmadd_pd(p0hi, x2, G0);
120 a15 = _mm_mul_pd(x12, a3);
121 p0lo = _mm_fmadd_pd(p0lo, a3, c);
122
123 p1 = _mm_xor_pd(p1, fix);
124 p0 = _mm_fmadd_pd(p0hi, a15, p0lo);
125 cmp = _mm_cmp_pd(x, THRESHOLD, _CMP_LT_OQ);
126
127 res = _mm_blendv_pd(p1, p0, cmp);
128
129 return res;
130 }
131
132