/*
 * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */


#if defined(TARGET_LINUX_POWER)
#include "xmm2altivec.h"
#elif defined(TARGET_LINUX_ARM64)
#include "arm64intrin.h"
#else
#include <immintrin.h>
#endif
#include <math.h>
#include "acos_defs.h"

extern "C" float __fss_acos_fma3(float);

/* Reinterpret the bit pattern of a float as a 32-bit integer. */
static int __float_as_int(float const a) {
    return *(int*)&a;
}

/* Reinterpret the bit pattern of a 32-bit integer as a float. */
static float __int_as_float(int const a) {
    return *(float*)&a;
}
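
/*
 * Scheme used below (the coefficient values come from acos_defs.h): two
 * minimax polynomials are evaluated simultaneously in one SSE register,
 * lane 0 with the A0_F..F0_F set and lane 1 with the A1_F..F1_F set.
 * Lane 0 covers |a| <= THRESHOLD_F:
 *     acos(a) ~ p(a^2) * a^3 + (F0_F - a)
 * Lane 1 covers |a| > THRESHOLD_F:
 *     acos(a) ~ sqrt(1 - |a|) * (p(|a|) * |a| + F1_F),
 * reflected to PI - result when a is negative.
 */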

float __fss_acos_fma3(float const a)
{
    __m128i const ZERO          = _mm_set1_epi32(0);
    __m128  const PI            = _mm_set1_ps(PI_F);

    // p0, p1 coefficients: lane 0 holds the small-|a| polynomial,
    // lane 1 the near-boundary polynomial
    __m128 const A = _mm_setr_ps(A0_F, A1_F, 0.0f, 0.0f);
    __m128 const B = _mm_setr_ps(B0_F, B1_F, 0.0f, 0.0f);
    __m128 const C = _mm_setr_ps(C0_F, C1_F, 0.0f, 0.0f);
    __m128 const D = _mm_setr_ps(D0_F, D1_F, 0.0f, 0.0f);
    __m128 const E = _mm_setr_ps(E0_F, E1_F, 0.0f, 0.0f);
    __m128 const F = _mm_setr_ps(F0_F, F1_F, 0.0f, 0.0f);

    __m128 _x2_x, _a, _a3, p, p0, p1, _sq, _c;
    float x, sq, res;
    x = __int_as_float(ABS_MASK_I & __float_as_int(a));  // x = |a|
    _x2_x = _mm_setr_ps(a * a, x, 0.0f, 0.0f);           // lanes: {a^2, |a|, 0, 0}
    _a = _mm_set1_ps(a);
    _a3 = _mm_mul_ps(_x2_x, _a);                         // lane 0: a^3
    _c = _mm_sub_ps(F, _a);                              // lane 0: F0_F - a

    // Evaluate both polynomials in parallel (Horner form)
    p = _mm_fmadd_ps(A, _x2_x, B);
    p = _mm_fmadd_ps(p, _x2_x, C);
    p = _mm_fmadd_ps(p, _x2_x, D);
    p = _mm_fmadd_ps(p, _x2_x, E);
    p0 = _mm_fmadd_ps(p, _a3, _c);
    res = __int_as_float(_mm_extract_ps(p0, 0));         // lane 0: small-|a| result

    // |a| above THRESHOLD_F: take the near-boundary path instead
    if (__float_as_int(x) > __float_as_int(THRESHOLD_F))
    {
        sq = 1.0f - x;
        /*
         * There seems to be a consensus that setting errno is important
         * for fastmath intrinsics, so call sqrtf() from libm here rather
         * than using the Intel hardware sqrt instruction.
         */
        sq = sqrtf(sq);
        _sq = _mm_setr_ps(0.0f, sq, 0.0f, 0.0f);
        p1 = _mm_fmadd_ps(p, _x2_x, F);                  // lane 1: p(|a|)*|a| + F1_F

        // pi_mask = PI in lanes where the sign bit of a is set (a < 0)
        __m128 pi_mask = (__m128)_mm_cmpgt_epi32(ZERO, (__m128i)_a);
        pi_mask = _mm_and_ps(pi_mask, PI);
        p1 = _mm_fmsub_ps(_sq, p1, pi_mask);             // sqrt(1-|a|)*p1 - (a<0 ? PI : 0)

        res = __int_as_float(_mm_extract_ps(p1, 1));     // lane 1: boundary result

        int sign;
        sign = SGN_MASK_I & __float_as_int(a);

        // Flipping the sign bit reflects negative inputs to PI - sqrt(1-|a|)*p1;
        // for out-of-domain a > 1.0f the NaN from sqrtf(negative) propagates through
        int fix;
        fix = (a > 1.0f) << 31;
        fix ^= sign;
        res = __int_as_float(__float_as_int(res) ^ fix);
    }

    return res;
}
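
/*
 * Hypothetical standalone check, not part of the original build: compile
 * this translation unit with -DACOS_FMA3_SELFTEST (a macro invented here)
 * and -mfma, then compare a few points against libm's acosf().  An
 * out-of-domain input should come back as NaN, with errno set to EDOM by
 * the sqrtf() call above.
 */
#ifdef ACOS_FMA3_SELFTEST
#include <errno.h>
#include <stdio.h>

int main(void)
{
    const float inputs[] = {-1.0f, -0.5f, 0.0f, 0.5f, 0.99f, 1.0f, 2.0f};
    for (unsigned i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
        errno = 0;
        float got = __fss_acos_fma3(inputs[i]);
        int err = errno;
        printf("acos(% .2f) = % .7f  (libm % .7f, errno %d)\n",
               inputs[i], got, acosf(inputs[i]), err);
    }
    return 0;
}
#endif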