1
2 /*
3 * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 */
18
19
#if defined(TARGET_LINUX_POWER)
#include "xmm2altivec.h"
#elif defined(TARGET_LINUX_ARM64)
#include "arm64intrin.h"
#else
#include <immintrin.h>
#endif
#include <math.h>
#include <string.h>
#include "acos_defs.h"
29
30 extern "C" float __fss_acos_fma3(float);
31
/*
 * Reinterpret the bit pattern of a 32-bit float as a signed int.
 *
 * The previous implementation used *(int*)&a, which violates the strict
 * aliasing rule (undefined behavior; compilers may reorder or drop the
 * access under -O2). memcpy is the sanctioned type-punning idiom and is
 * lowered to a single register move by any modern optimizer.
 */
static int __float_as_int(float const a) {
  int i;
  memcpy(&i, &a, sizeof(i));
  return i;
}
35
/*
 * Reinterpret the bit pattern of a signed int as a 32-bit float.
 *
 * Replaces the strict-aliasing-violating *(float*)&a with memcpy — the
 * well-defined type-punning idiom, compiled to a plain register move.
 */
static float __int_as_float(int const a) {
  float f;
  memcpy(&f, &a, sizeof(f));
  return f;
}
39
/*
 * Scalar single-precision acos(a) for FMA3-capable targets.
 *
 * Two polynomial fits (coefficients A0_F..F1_F from acos_defs.h) are
 * evaluated simultaneously in SIMD lanes 0 and 1 of one __m128 via a
 * shared Horner chain:
 *   lane 0: used directly when |a| is at or below THRESHOLD_F,
 *   lane 1: combined with sqrtf(1 - |a|) when |a| exceeds THRESHOLD_F.
 * Only the lane matching the magnitude of `a` is extracted as the result.
 */
float __fss_acos_fma3(float const a)
{
    __m128i const ZERO = _mm_set1_epi32(0);
    __m128 const PI = _mm_set1_ps(PI_F);

    // p0, p1 coefficients: lane 0 holds the small-|a| polynomial's
    // coefficients, lane 1 the large-|a| polynomial's.
    __m128 const A = _mm_setr_ps(A0_F, A1_F, 0.0f, 0.0f);
    __m128 const B = _mm_setr_ps(B0_F, B1_F, 0.0f, 0.0f);
    __m128 const C = _mm_setr_ps(C0_F, C1_F, 0.0f, 0.0f);
    __m128 const D = _mm_setr_ps(D0_F, D1_F, 0.0f, 0.0f);
    __m128 const E = _mm_setr_ps(E0_F, E1_F, 0.0f, 0.0f);
    __m128 const F = _mm_setr_ps(F0_F, F1_F, 0.0f, 0.0f);

    __m128 _x2_x, _a, _a3, p, p0, p1, _sq, _c;
    float x, sq, res;
    x = __int_as_float(ABS_MASK_I & __float_as_int(a));   // x = |a| (sign bit cleared)
    _x2_x = _mm_setr_ps(a * a, x, 0.0f, 0.0f);            // lane 0: a^2, lane 1: |a|
    _a = _mm_set1_ps(a);
    _a3 = _mm_mul_ps(_x2_x, _a);                          // lane 0: a^3
    _c = _mm_sub_ps(F, _a);                               // lane 0: F0_F - a (F0_F presumably ~pi/2 — see acos_defs.h)

    // Horner evaluation of both polynomials in parallel (lane 0 in a^2,
    // lane 1 in |a|).
    p = _mm_fmadd_ps(A, _x2_x, B);
    p = _mm_fmadd_ps(p, _x2_x, C);
    p = _mm_fmadd_ps(p, _x2_x, D);
    p = _mm_fmadd_ps(p, _x2_x, E);
    p0 = _mm_fmadd_ps(p, _a3, _c);                        // lane 0: p(a^2)*a^3 + (F0_F - a)
    res = __int_as_float(_mm_extract_ps(p0, 0));          // small-|a| result (lane 0 bits)

    // Integer compare of the bit patterns: for non-negative floats this
    // orders identically to a float compare, and also routes NaN inputs
    // (exponent all-ones) into the branch below.
    if (__float_as_int(x) > __float_as_int(THRESHOLD_F))
    {
        sq = 1.0f - x;                                    // negative (hence sqrtf -> NaN) when |a| > 1
        /*
         * There seems to be a consensus that setting errno is important
         * for fastmath intrinsics.
         * Disable using Intel hardware instruction sqrt.
         */
        sq = sqrtf(sq);
        _sq = _mm_setr_ps(0.0f, sq, 0.0f, 0.0f);          // lane 1: sqrt(1 - |a|)
        p1 = _mm_fmadd_ps(p, _x2_x, F);                   // lane 1: p(|a|)*|a| + F1_F

        // ZERO > bits(a) as signed int32 is true exactly when the sign bit
        // of `a` is set, so pi_mask selects PI in each lane where a < 0.
        __m128 pi_mask = (__m128)_mm_cmpgt_epi32(ZERO, (__m128i)_a);
        pi_mask = _mm_and_ps(pi_mask, PI);
        p1 = _mm_fmsub_ps(_sq, p1, pi_mask);              // lane 1: sq*p1 - (a < 0 ? pi : 0)

        res = __int_as_float(_mm_extract_ps(p1, 1));      // large-|a| result (lane 1 bits)

        int sign;
        sign = SGN_MASK_I & __float_as_int(a);            // sign bit of the input

        int fix;
        // Sign fix-up via acos(-x) = pi - acos(x): xor the result's sign
        // with the input's. For a > 1.0f (domain error) the extra bit-31
        // flip presumably keeps the NaN produced by sqrtf correctly
        // signed — NOTE(review): confirm against the reference results.
        fix = (a > 1.0f) << 31;
        fix ^= sign;
        res = __int_as_float(__float_as_int(res) ^ fix);
    }

    return res;
}
97