1 
2 /*
3  * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  */
18 
19 #if defined(TARGET_LINUX_POWER)
20 #include "xmm2altivec.h"
21 #elif defined(TARGET_LINUX_ARM64)
22 #include "arm64intrin.h"
23 #else
24 #include <immintrin.h>
25 #endif
26 #include "acos_defs.h"
27 
28 extern "C" __m128 __fvs_acos_fma3(__m128 const a);
29 
/*
 * Packed single-precision acos(a) over 4 lanes (SSE, FMA3 variant).
 *
 * Two approximations are evaluated unconditionally and blended per lane:
 *   - p0 (|a| <= THRESHOLD_F): odd polynomial about 0,
 *         acos(a) ~= (F0 - a) + a^3 * P(a^2)
 *   - p1 (|a| >  THRESHOLD_F, i.e. near +/-1):
 *         acos(a) ~= sqrt(1 - |a|) * Q(|a|)            for a >= 0
 *         acos(a) ~= pi - sqrt(1 - |a|) * Q(|a|)       for a <  0
 *     (the negative case is produced as -(sq*Q - pi) via pi_mask
 *      plus the sign xor at the end).
 *
 * NOTE(review): the coefficients A*_F..F*_F, THRESHOLD_F and the mask
 * constants come from acos_defs.h (not visible here); polynomial
 * degrees and accuracy targets are inferred from the Horner chains
 * below — confirm against the header.
 */
__m128 __fvs_acos_fma3(__m128 const a)
{
    __m128  const ABS_MASK      = (__m128)_mm_set1_epi32(ABS_MASK_I);  // clears the sign bit
    __m128  const SGN_MASK      = (__m128)_mm_set1_epi32(SGN_MASK_I);  // isolates the sign bit
    __m128  const ONE           = _mm_set1_ps(1.0f);
    __m128i const ZERO          = _mm_set1_epi32(0);
    __m128i const THRESHOLD     = (__m128i)_mm_set1_ps(THRESHOLD_F);   // threshold as raw bits, for integer compare
    __m128  const PI            = _mm_set1_ps(PI_F);

    // p0 coefficients (central polynomial, evaluated in a^2)
    __m128 const A0 = _mm_set1_ps(A0_F);
    __m128 const B0 = _mm_set1_ps(B0_F);
    __m128 const C0 = _mm_set1_ps(C0_F);
    __m128 const D0 = _mm_set1_ps(D0_F);
    __m128 const E0 = _mm_set1_ps(E0_F);
    __m128 const F0 = _mm_set1_ps(F0_F);

    // p1 coefficients (near-endpoint polynomial, evaluated in |a|)
    __m128 const A1 = _mm_set1_ps(A1_F);
    __m128 const B1 = _mm_set1_ps(B1_F);
    __m128 const C1 = _mm_set1_ps(C1_F);
    __m128 const D1 = _mm_set1_ps(D1_F);
    __m128 const E1 = _mm_set1_ps(E1_F);
    __m128 const F1 = _mm_set1_ps(F1_F);

    __m128 x, x2, a3, sq, p0, p1, res, c, cmp0;
    x = _mm_and_ps(ABS_MASK, a);          // x = |a|
    sq = _mm_sub_ps(ONE, x);
    sq = _mm_sqrt_ps(sq); // sqrt(1 - |a|)  (NaN when a > 1, i.e. out of domain)

    // Integer compares on float bit patterns: for non-negative IEEE-754
    // floats the bit pattern orders the same as the value, so x vs
    // THRESHOLD is a valid magnitude test; and ZERO > (int)a is true
    // exactly when a's sign bit is set (a negative, including -0.0).
    __m128 pi_mask = (__m128)_mm_cmpgt_epi32(ZERO, (__m128i)a);
    cmp0 = (__m128)_mm_cmpgt_epi32((__m128i)x, THRESHOLD);   // lanes where |a| > THRESHOLD_F

    // Polynomial evaluation: two Horner chains (p0 in x2, p1 in x),
    // interleaved so the independent dependency chains can issue in
    // parallel on the FMA units — do not reorder.
    x2 = _mm_mul_ps(a, a);
    c  = _mm_sub_ps(F0, a);              // constant + linear part of p0
    p1 = _mm_fmadd_ps(A1, x, B1);
    p0 = _mm_fmadd_ps(A0, x2, B0);
    p1 = _mm_fmadd_ps(p1, x, C1);
    p0 = _mm_fmadd_ps(p0, x2, C0);
    p1 = _mm_fmadd_ps(p1, x, D1);
    a3 = _mm_mul_ps(x2, a);              // a^3 (keeps the sign of a)
    p0 = _mm_fmadd_ps(p0, x2, D0);
    p1 = _mm_fmadd_ps(p1, x, E1);
    p0 = _mm_fmadd_ps(p0, x2, E0);
    p1 = _mm_fmadd_ps(p1, x, F1);
    p0 = _mm_fmadd_ps(p0, a3, c);        // p0 = P(a^2)*a^3 + (F0 - a)

    // Negative lanes: compute sq*Q(|a|) - pi here; the sign xor below
    // negates it to pi - sq*Q(|a|) = acos(a).
    pi_mask = _mm_and_ps(pi_mask, PI);   // PI where a < 0, else +0.0
    p1 = _mm_fmsub_ps(sq, p1, pi_mask);

    __m128 sign;
    sign = _mm_and_ps(a, SGN_MASK);      // sign bit of a

    // Build the xor mask that negates p1 on negative lanes. For a > 1
    // (out of domain, sq is NaN) the mask is flipped instead —
    // NOTE(review): this appears to control the sign bit of the
    // propagated NaN for a > 1; confirm intended special-case behavior.
    __m128 fix;
    fix = _mm_cmp_ps(a, ONE, _CMP_GT_OQ);
    fix = _mm_and_ps(fix, SGN_MASK);
    fix = _mm_xor_ps(fix, sign);
    p1 = _mm_xor_ps(p1, fix);            // apply sign correction to the near-endpoint result

    // Per-lane select: p1 where |a| > threshold (near +/-1), else p0.
    res = _mm_blendv_ps(p0, p1, cmp0);

    return res;
}
94