1
2 /*
3 * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 */
18
19 #if defined(TARGET_LINUX_POWER)
20 #error "Source cannot be compiled for POWER architectures"
21 #include "xmm2altivec.h"
22 #else
23 #include <immintrin.h>
24 #endif
25 #include "acos_defs.h"
26 #include "mth_avx512helper.h"
27
28 extern "C" __m512 FCN_AVX512(__fvs_acos_fma3)(__m512 const a);
29
FCN_AVX512(__fvs_acos_fma3)30 __m512 FCN_AVX512(__fvs_acos_fma3)(__m512 const a)
31 {
32 __m512 const ABS_MASK = (__m512)_mm512_set1_epi32(ABS_MASK_I);
33 __m512 const SGN_MASK = (__m512)_mm512_set1_epi32(SGN_MASK_I);
34 __m512 const ONE = _mm512_set1_ps(1.0f);
35 __m512i const ZERO = _mm512_set1_epi32(0);
36 __m512i const THRESHOLD = (__m512i)_mm512_set1_ps(THRESHOLD_F);
37 __m512 const PI = _mm512_set1_ps(PI_F);
38
39 // p0 coefficients
40 __m512 const A0 = _mm512_set1_ps(A0_F);
41 __m512 const B0 = _mm512_set1_ps(B0_F);
42 __m512 const C0 = _mm512_set1_ps(C0_F);
43 __m512 const D0 = _mm512_set1_ps(D0_F);
44 __m512 const E0 = _mm512_set1_ps(E0_F);
45 __m512 const F0 = _mm512_set1_ps(F0_F);
46
47 // p1 coefficients
48 __m512 const A1 = _mm512_set1_ps(A1_F);
49 __m512 const B1 = _mm512_set1_ps(B1_F);
50 __m512 const C1 = _mm512_set1_ps(C1_F);
51 __m512 const D1 = _mm512_set1_ps(D1_F);
52 __m512 const E1 = _mm512_set1_ps(E1_F);
53 __m512 const F1 = _mm512_set1_ps(F1_F);
54
55 __m512 x, x2, a3, sq, p0, p1, res, c, cmp0;
56 x = _MM512_AND_PS(ABS_MASK, a);
57 sq = _mm512_sub_ps(ONE, x);
58 sq = _mm512_sqrt_ps(sq); // sqrt(1 - |a|)
59
60 __m512 pi_mask = (__m512)_MM512_CMPGT_EPI32(ZERO, (__m512i)a);
61 cmp0 = (__m512)_MM512_CMPGT_EPI32((__m512i)x, THRESHOLD);
62
63 // polynomials evaluation
64 x2 = _mm512_mul_ps(a, a);
65 c = _mm512_sub_ps(F0, a);
66 p1 = _mm512_fmadd_ps(A1, x, B1);
67 p0 = _mm512_fmadd_ps(A0, x2, B0);
68 p1 = _mm512_fmadd_ps(p1, x, C1);
69 p0 = _mm512_fmadd_ps(p0, x2, C0);
70 p1 = _mm512_fmadd_ps(p1, x, D1);
71 a3 = _mm512_mul_ps(x2, a);
72 p0 = _mm512_fmadd_ps(p0, x2, D0);
73 p1 = _mm512_fmadd_ps(p1, x, E1);
74 p0 = _mm512_fmadd_ps(p0, x2, E0);
75 p1 = _mm512_fmadd_ps(p1, x, F1);
76 p0 = _mm512_fmadd_ps(p0, a3, c);
77
78 pi_mask = _MM512_AND_PS(pi_mask, PI);
79 p1 = _mm512_fmsub_ps(sq, p1, pi_mask);
80
81 __m512 sign;
82 sign = _MM512_AND_PS(a, SGN_MASK);
83
84 __m512 fix;
85 fix = _MM512_CMP_PS(a, ONE, _CMP_GT_OQ);
86 fix = _MM512_AND_PS(fix, SGN_MASK);
87 fix = _MM512_XOR_PS(fix, sign);
88 p1 = _MM512_XOR_PS(p1, fix);
89
90 res = _MM512_BLENDV_PS(p0, p1, cmp0);
91
92 return res;
93 }
94